Example #1
    def initializeModels(self):
        try:
            if self.kmeansDF:
                logger.info("Already loaded this DataFrame")
                pass
        except AttributeError:
            self.kmeansDF = None

        commandsDF = self.bashDF.map(lambda row: Row(date=row.date,
                                                     source=row.source,
                                                     username=row.username,
                                                     exec_as=row.exec_as,
                                                     srcip=row.srcip,
                                                     command=row.command.split(" "))).toDF()
        commandsDF.cache()

        word2Vec = Word2Vec(vectorSize=100, minCount=1, inputCol="command", outputCol="features")
        w2model = word2Vec.fit(commandsDF)
        resultDF = w2model.transform(commandsDF)
        resultDF.cache()

        kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction", maxIter=10, initSteps=3)
        kmodel = kmeans.fit(resultDF)

        kmeansDF = kmodel.transform(resultDF)
        kmeansDF.cache()
        kmeansDF.coalesce(1).write.parquet('/user/jleaniz/ml/kmeans', mode='append')

        outliers = kmeansDF.groupBy("prediction").count().filter('count < 10').withColumnRenamed("prediction", "cluster")

        self.outlierCmds = outliers.join(kmeansDF, kmeansDF.prediction == outliers.cluster)
Example #2
    def test_kmeans_deterministic(self):
        from pyspark.mllib.clustering import KMeans

        X = range(0, 100, 10)
        Y = range(0, 100, 10)
        data = [[x, y] for x, y in zip(X, Y)]
        clusters1 = KMeans.train(self.sc.parallelize(data), 3, initializationMode="k-means||", seed=42)
        clusters2 = KMeans.train(self.sc.parallelize(data), 3, initializationMode="k-means||", seed=42)
        centers1 = clusters1.centers
        centers2 = clusters2.centers
        for c1, c2 in zip(centers1, centers2):
            # TODO: Allow small numeric difference.
            self.assertTrue(array_equal(c1, c2))
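The TODO above asks for a tolerance-based comparison; a minimal sketch of such a check (NumPy's allclose is the only addition beyond what the test already uses):

import numpy as np

def assert_centers_close(centers1, centers2, rtol=1e-5, atol=1e-8):
    # Compare corresponding centers allowing a small numeric difference,
    # rather than requiring exact equality via array_equal.
    for c1, c2 in zip(centers1, centers2):
        assert np.allclose(c1, c2, rtol=rtol, atol=atol)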
    def fit(self, Z):
        """Compute k-means clustering.

        Parameters
        ----------
        Z : ArrayRDD or DictRDD containing array-like or sparse matrix
            Train data.

        Returns
        -------
        self
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))
        if self.init == 'k-means||':
            self._mllib_model = MLlibKMeans.train(
                X.unblock(),
                self.n_clusters,
                maxIterations=self.max_iter,
                initializationMode="k-means||")
            self.cluster_centers_ = self._mllib_model.centers
        else:
            models = X.map(lambda X: super(SparkKMeans, self).fit(X))
            models = models.map(lambda model: model.cluster_centers_).collect()
            return super(SparkKMeans, self).fit(np.concatenate(models))
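In the non-'k-means||' branch above, a local scikit-learn KMeans is fit per block and the collected block centers are re-fit once more. A minimal local sketch of that reduction, using plain scikit-learn and NumPy (no Spark) purely for illustration:

import numpy as np
from sklearn.cluster import KMeans

# Pretend each "block" holds one partition's worth of rows.
blocks = [np.random.RandomState(i).rand(200, 3) for i in range(4)]

# Fit a local k-means per block and keep only its centers ...
per_block_centers = [KMeans(n_clusters=5, n_init=10).fit(b).cluster_centers_
                     for b in blocks]

# ... then fit a final k-means on the concatenated centers, which is what the
# else branch above does with the collected per-block models.
final = KMeans(n_clusters=5, n_init=10).fit(np.concatenate(per_block_centers))
print(final.cluster_centers_.shape)  # (5, 3)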
Example #4
    def train_model(self, dataframe, k, model_name):
        '''
        Use the data to train a KMeans model.
        :param dataframe: DataFrame whose columns are all used for training
        :param k: the k value (number of clusters)
        :param model_name: name under which the trained model is saved
        :return: None
        '''

        data = self.prepare_data(dataframe)

        # train to get model
        model = KMeans.train(data, k)

        # create model saving path
        path = self.base + model_name

        # try to delete the old model if it exists
        try:
            import subprocess
            subprocess.call(["hadoop", "fs", "-rm", "-f", path])
        except:
            pass
        # save new model on hdfs
        model.save(self.sc, path)
        # print all cluster of the model
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)
Example #5
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file
   # print wordArr
    #tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def train_subquantizers(sc, split_vecs, M, subquantizer_clusters, model, seed=None):
    """
    Project each data point into it's local space and compute subquantizers by clustering
    each fine split of the locally projected data.
    """
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in xrange(M):
        data = split_vecs.map(lambda x: x[split])
        data.cache()
        sub = KMeans.train(data, subquantizer_clusters, initializationMode='random', maxIterations=10, seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    return (subquantizers[:len(subquantizers) / 2], subquantizers[len(subquantizers) / 2:])
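The subquantizer step above amounts to cutting each projected vector into M fine sub-vectors and clustering each split independently. A small local NumPy/scikit-learn sketch of that idea (the random data and cluster counts are made up for illustration):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
projected = rng.rand(1000, 8)            # stand-in for the locally projected vectors
M, subquantizer_clusters = 4, 16

# np.split along axis=1 yields M equal-width fine splits, mirroring
# split_vecs = projected.map(lambda x: np.split(x, M)) above.
splits = np.split(projected, M, axis=1)
subquantizers = [KMeans(n_clusters=subquantizer_clusters, n_init=4).fit(s).cluster_centers_
                 for s in splits]
print([sq.shape for sq in subquantizers])  # M arrays of shape (16, 2)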
Example #7
    def test_kmeans(self):
        from pyspark.mllib.clustering import KMeans

        data = [[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3]))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load in pickled noun to vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)

    # Create vectors array
    vectors = noun_to_vect_dict.values()

    # Initialize Spark Context
    sc = ps.SparkContext('local[*]')
    # Load data
    data = sc.parallelize(vectors, 1024)

    # Create and fit a KMeans model to the data
    logger.info('Fitting KMeans model')
    kmeans_model = KMeans.train(data, N_CLUSTERS, maxIterations=10, runs=10,
                                initializationMode='k-means||')

    # Create a list of labels corresponding to vectors
    logger.info('Labeling vectors')
    labels = [kmeans_model.predict(vector) for vector in vectors]
    # Write to text file
    logger.info('Writing labels to file')
    with open(path.join(OUT_FILE_LOC, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(str(label) + '\n')
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as pickled:
        noun_to_vect_dict = pickle.load(pickled)
    # Create vector array from mapping
    vectors = np.array(noun_to_vect_dict.values())
    max_k = int(sqrt(len(vectors) / 2.0))

    # Define search space for k
    numbers_of_clusters = reversed(range(MIN_K, max_k))

    # For each k
    for i, k in enumerate(numbers_of_clusters):
        # Initialize Spark Context
        sc = ps.SparkContext()
        # Load data
        data = sc.parallelize(vectors, 1024)

        logger.info('Trial %i of %i, %i clusters', (i + 1), max_k - 1, k)
        # Calculate cluster
        kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                    initializationMode='k-means||')
        logger.info('Calculating WSSSE')
        # Calculate WSSSE
        WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                    .reduce(lambda x, y: x + y)
        logger.info('Writing WSSSE')
        # Write k and WSSSE
        with open(path.join(OUT_FILES_LOC, 'elbow_data.txt'), 'a') as elbow_data:
            elbow_data.write(str(k) + '\t' + str(WSSSE) + '\n')

        sc.stop()
Example #10
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype = int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur  
    string = string + "Acc: " + str((float(acc)/train) * 100) + "%"    
    sc.stop()
    return string
def kMeans(vecs, clusterNum):
	clusters = KMeans.train(vecs, clusterNum, maxIterations=10, runs=10, initializationMode="random")

	if pv.outputDebugMsg:
		Utils.logMessage("\nKmean cluster finished")

	return clusters
Example #12
def clusterKMeanSpark(matrix,k):
	m = transformInRealMatrix(matrix)
	sc = SparkContext(appName="Jsonizer: Remove stop words")
	parsedData = sc.parallelize(m)
	y = []
	x = []
	clustersControl = range(k,k+1)
	for kc in clustersControl:
		clusters = KMeans.train(parsedData, kc, maxIterations=50000,runs=200, initializationMode="k-means||",epsilon=0.0001)
		clu = []

		def error(point,clust):
		    center = clust.centers[clust.predict(point)]
		    return sqrt(sum([x**2 for x in (point - center)]))


		WSSSE = parsedData.map(lambda point: error(point,clusters)).reduce(lambda x, y: x + y)
		for n in m:
			clu += [clusters.predict(np.array(n))]

		x += [kc]
		y += [WSSSE]

		#print(kc,WSSSE)

	#plt.plot(x,y)
	#plt.ylabel('some numbers')
	#plt.show()

	ret = [[] for i in range(0,max(clu)+1)]
	for i in range(0,len(clu)):
		ret[clu[i]] += [i]
	sc.stop()
	return ret
def kmeans(iterations, theRdd):
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))
    clusters = KMeans.train(theRdd, iterations, maxIterations=10,
            runs=10, initializationMode="random")
    WSSSE = theRdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    return WSSSE, clusters
Example #14
def main(arg1, arg2):
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(arg1)
    data = lines.map(parseVector)
    k = int(arg2)
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def spark_KMeans(train_data):
    maxIterations = 10
    runs = 20
    numClusters = [2,3,4,5,6,7,8,9,10,11,12,13,14]
    errors = []
    for k in numClusters:
        model = KMeans.train(train_data, k, maxIterations=maxIterations, runs=runs,initializationMode='random', seed=10, initializationSteps=5, epsilon=1e-4)
        WSSSE = model.computeCost(train_data)
        errors.append(WSSSE)

    plt.plot(numClusters, errors, 'ro')
    plt.xlabel(r'k')
    plt.ylabel(r'inertia')
    plt.title(r'inertia v.s. k')
    plt.savefig('kmeans_cross_validation.png')

    bestModel = KMeans.train(train_data, 6, maxIterations=maxIterations, runs=runs,initializationMode='random', seed=10, initializationSteps=5, epsilon=1e-4)
    return bestModel
def cluster_data(sc, qc):
	drivers = read_file_path(BASE_PATH)
	print "Number of drivers: %d" % len(drivers)

	# Load and parse the data
	for i, dr in enumerate(drivers):
		# extract driver number from path
		dr_num = re.search("[0-9]+$", dr.strip())

		if dr_num:
			dr_num = dr_num.group(0)
			if dr_num == '1018':
				continue
		else:
			print 'driver number error for %s' % dr 
			continue

		dr_data = sc.textFile("hdfs://" + dr + "/" + dr_num + "_all_trips.txt")

		data = dr_data.map(lambda row: [float(x) for x in row.split(',')])

		if i == 0:
			all_data = data
		else:
			all_data = all_data.union(data)

		data.unpersist()

	print 'Total number of records: %d' % all_data.count()

	# Build the model (cluster the data), k = Number of clusters
	k = 5 
	t = time()
	clusters = KMeans.train(all_data, k, maxIterations=100, runs=100, initializationMode="random", )
	print 'KMeans took %.2f seconds' % (time() - t)

	# Compute cost
	WSSSE_map = all_data.map(lambda point: error(point, clusters))

	# Join cluster ID to original data
	all_data_w_cluster = all_data.map(lambda point: np.hstack((point, get_cluster_id(clusters, point))))

	# all_data_w_cluster.saveAsTextFile("hdfs:///usr/local/spark/kmeans/results.txt")

	for i in xrange(0,k):
		subset = all_data_w_cluster.filter(lambda x: x[-1] == i)
		print "Number of items in cluster %d: %d" % (i, subset.count())
		# Compute functions on different features:
		all_features_average = subset.sum() / subset.count()
		print 'Average of all features'
		print all_features_average
	
	WSSSE = all_data.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
	print("Within set sum of squared error: " + str(WSSSE))
Example #17
def k_means(loadTrainingFilePath, sc):
	# Load and parse the data
	loadTrainingFilePath = "../data/kmeans_data.txt"
	data = sc.textFile(loadTrainingFilePath)
	parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
	# Build the model (cluster the data)
	clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30, initializationMode="random")

	WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

	print("Within Set Sum of Squared Error = " + str(WSSSE))
def build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs):
    """Perform the clustering of vectors using K-means.

    Returns:
        k means model learned from the training data in
            tfidf_vectors_rdd

    """

    # Build the model (cluster the training data)
    return KMeans.train(tfidf_vectors_rdd, num_clusters, maxIterations=max_iterations, runs=runs)
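A hedged usage sketch for build_cluster_model, assuming a Spark 1.x-era MLlib where KMeans.train still accepts the runs argument (the app name and token lists below are made up for illustration):

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext(appName="tfidf-kmeans-demo")
docs = sc.parallelize([["spark", "kmeans"], ["spark", "tfidf"], ["hadoop", "hdfs"]])

tf = HashingTF(1 << 18).transform(docs)  # hash tokens into sparse term-frequency vectors
tf.cache()
tfidf = IDF().fit(tf).transform(tf)      # reweight by inverse document frequency

model = build_cluster_model(tfidf, num_clusters=2, max_iterations=10, runs=5)
print(model.clusterCenters)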
Example #19
    def test_clustering(self):
        from pyspark.mllib.clustering import KMeans
        data = [
            self.scipy_matrix(3, {1: 1.0}),
            self.scipy_matrix(3, {1: 1.1}),
            self.scipy_matrix(3, {2: 1.0}),
            self.scipy_matrix(3, {2: 1.1})
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
def main(noun_file_loc, model_file_loc, percent, n_trials, out_files_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading Word2Vec model')
    # Load trained Word2Vec model
    model = Word2Vec.load(model_file_loc)

    logger.info('Reading in list of nouns')
    # Read in list of sorted nouns
    sorted_nouns = []
    with open(noun_file_loc, 'r') as f:
        for line in f:
            sorted_nouns.append(line.strip())
    # Count number of nouns
    n_nouns = len(sorted_nouns)

    # Create dictionary to map nouns to vectors
    noun_to_vect_dict = {}
    # Calculate index to stop slice as percentage of total nouns
    n_nouns_to_keep = int(n_nouns * percent / 100.)
    logger.info('Keeping %i nouns, %i percent of %i',
                n_nouns_to_keep, percent, n_nouns)
    # Add nouns and vectors to dictionary
    for noun in sorted_nouns[0:n_nouns_to_keep]:
        noun_to_vect_dict[noun] = model[noun]

    vectors = np.array(noun_to_vect_dict.values())

    # Initialize Spark Context
    sc = ps.SparkContext('local[4]')
    # Load data
    data = sc.parallelize(vectors)

    # Define search space for k
    ns_clusters = [int(x) for x in np.linspace(2, n_nouns, n_trials)]
    # Open WSSSEs output file
    with open(path.join(out_files_loc, 'elbow_data.txt'), 'w') as elbow_data:
        # For each k
        for i, k in enumerate(ns_clusters):
            logger.info('Trial %i of %i, %i clusters', (i + 1), n_trials, k)
            # Calculate cluster
            kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                        initializationMode='k-means||')
            # Calculate WSSSE
            WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                        .reduce(lambda x, y: x + y)
            # Save centroids
            with open(path.join(out_files_loc, '_%i.pkl' % k), 'wb') as f:
                pickle.dump(kmeans_model.clusterCenters, f)
            # Write k and WSSSE
            elbow_data.write('%i, %f\n' % (k, WSSSE))
def main():
    sc = SparkContext()
    filename = sys.argv[1]
    clusters=int(sys.argv[2])
    outmodelname = sys.argv[3]
    dataset = gdal.Open(filename, GA_ReadOnly)
    driver = dataset.GetDriver().ShortName
    x, y, data = tiff_to_array(dataset, weights)
    print "after change to array"
    clusterdata = sc.parallelize(data)
    print "parallelize done"
    kmeanmodel = KMeans.train(clusterdata, clusters, maxIterations=50, runs=10)
    kmeanmodel.save(sc, outmodelname)
    print kmeanmodel.clusterCenters
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """

    # Cluster first split
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print 'Total training set size: %d' % first.count()
    print 'Starting training coarse quantizer...'
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    first.unpersist()

    # Cluster second split
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print 'Starting training coarse quantizer...'
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    second.unpersist()

    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
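A hedged sketch of how split_vecs might be prepared for train_coarse, assuming each record is a single 1-D NumPy vector that is cut into two coarse halves (the dimensions and V below are illustrative):

import numpy as np

vecs = sc.parallelize([np.random.rand(128) for _ in range(1000)])
split_vecs = vecs.map(lambda x: np.split(x, 2))  # [first_half, second_half] per record
C0, C1 = train_coarse(sc, split_vecs, V=64, seed=42)
print(C0.shape, C1.shape)                        # (64, 64) each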
Example #23
def bagofwords(imtrain, imtest=None, features=_features, outdir=None):
    cache = Cache(cacheroot=outdir)

    # Unique labels
    labels = imtrain.map(lambda x: x.category).distinct().collect()
    print labels

    # Features: each returns a row array of features 
    X = imtrain.map(features)  
    
    # Clustering: kmeans clustering to generate words
    # http://spark.apache.org/docs/0.9.0/mllib-guide.html
    model = KMeans.train(X, 2, maxIterations=10, runs=30, initializationMode='random')
    
    # construct bag of words representation
    print model.clusterCenters
Example #24
def kmeans(k=2):
    """ kmeans """

    # Load and parse training data
    data = getTrainData(dataFilename)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) # pyspark.rdd.PipelinedRDD

    # Build the model (cluster the data)
    #  KMeans.train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||")
    clf = KMeans.train(parsedData, k, maxIterations=10, runs=10, initializationMode="random") # pyspark.mllib.clustering.KMeansModel

    WSSSE = parsedData.map(lambda point: error(point, clf)).reduce(lambda x, y: x + y) # float
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    print "###  cluster centers  ###:"
    print clf.centers
    return clf
Example #25
def kmeans_CSV():
    try:
	# creating a parsedData RDD to do kmeans on
	servernum = sys.argv[1]
	serverpath = "hdfs://10.0.0.4:8020/opentsdb/" + servernum
	print "Attempting to create SparkContext"
	sconf = SparkConf().setAppName("Kmeans for files")
	print "Sconf set..."
	sc = SparkContext(conf=sconf)
	print "SparkContext created"
	
	# making parsedData RDD: [ array([filewrites, filereads, CPU, diskIOBW, net bytes]), array([...]), ... ]
	# kmeans iteratively passes over data multiple times - cache parsedData
	
	if len(sys.argv) == 2: #user just specified server - do full server kmeans
	    filepaths = get_file_paths(serverpath)# Array of string file paths to all files within folder
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename =  make_name(filepaths) + "_" + servernum
	elif len(sys.argv) == 3: #user put in server and single timeframe - do single file kmeans
	    timeframe = sys.argv[2] #ex: 2014-07-09
	    filepaths = get_singlefile_path(timeframe, serverpath)
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename = str(timeframe) + "_" + servernum
	else: #user put in server and start/end timeframe - do timeframe kmeans
	    start_timeframe = sys.argv[2]
	    end_timeframe = sys.argv[3]
	    filepaths = get_timeframefile_paths(start_timeframe, end_timeframe, serverpath)
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename =  make_name(filepaths)+ "_" + servernum
	k = findk(parsedData.count())
	
	clusters = KMeans.train(parsedData, k, maxIterations=10, runs=10, initializationMode="random")
	centers = clusters.clusterCenters
	
	# Creating two CSVs (one has data points, one has centers) for later visualization
	compile_CSV(CSV_filename, parsedData)
	compile_centers_CSV(CSV_filename, centers)
	print "SUCCESS: Kmeans done"
    except:
	print "---------------------------------"
	print "Usage: ./bin/spark-submit kmeans_CSV.py <servername> <start_timeframe> <end_timeframe>"
	print "<servername> must be specified. EX: sense0 "
	print "Timeframes are optional. Specify just one timeframe for single file kmeans. Specify start and end for kmeans over timeframe."
	print "Timeframes must be in format yyyy-mm-DD"
	print "---------------------------------"
	raise
Example #26
    def detect(self, k, t):
        #Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1])
        df1.show()

        #Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        #Adding the prediction column to df1
        modelBC = sc.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features))
        df2.show()

        #Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2)
        df3.show()

        return df3.where(df3.score > t)
Example #27
    def kmeans_demo(self):

        file = self.sc.textFile(self.base+'k_data.csv')

        # transform to rdd
        data = file.map(lambda line: line.split(',')).cache()
        print(type(data))

        # train data to get the model
        model = KMeans.train(data,k=3)

        # print to check all clusters
        cluster = model.clusterCenters
        for c in cluster:
            print(c)


        # predict new data  return the data belong to which cluster(index of the cluster)
        predict = model.predict([1.3,.1,1.1])

        print(predict)
Example #28
    def clustering_score(data,k):
        model = KMeans.train(data, k=k,maxIterations=200)

        def distance(v1, v2):
            s = 0
            # [1,2,3] [4,5,6] --> [(1,4),(2,5),(3,6)]
            pairs = zip(v1,v2)
            for p in pairs:
                sub = float(p[0]) - float(p[1])
                s = s + sub * sub
            return math.sqrt(s)

        def dist_to_centroid(datum):
            # predict the data
            cluster = model.predict(datum)
            # get the current centroid --> means center point
            centroid = model.clusterCenters[cluster]
            # call distance method
            return distance(centroid, datum)

        return data.map(dist_to_centroid).mean()
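A possible way to use clustering_score is to sweep k and look for the elbow where the mean distance to the centroid stops dropping quickly; a hedged sketch (data is assumed to be an RDD of numeric feature vectors, and the method is called as a plain function here for brevity):

scores = [(k, clustering_score(data, k)) for k in range(2, 11)]
for k, score in scores:
    print(k, score)  # pick the k near the "elbow" of this curve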
def run():
    # Set up
    sc = SparkContext()
    records = sc.textFile(os.path.realpath(__file__+'/..') + '/data-scraper/data')
    # Build clusters
    kvpairs = records.map(keyAndParse)
    cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
    kvpairs = kvpairs.reduceByKey(combine)

    # Filter outliers with too few records
    kvpairs = kvpairs.filter(lambda (k,v): cts[k] > 2)
    kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))
    
    numClusters = 20
    clusters = KMeans.train(kvpairs.map(lambda (k,v): v),numClusters,10)
    groupedClusters = kvpairs.groupBy(lambda (k,v): clusters.predict(v)).map(lambda x: (x[0], getNames(list(x[1])))).collect()
    # Rank clusters
    centers = avg(clusters.clusterCenters)
    centers.sort(key=lambda x: x['score'], reverse=True)
    # Save sorted clusters
    save(groupedClusters, centers)
Example #30
    def kmeans_train(self, data_rdd, n_clusters):
        """
        This method is used to train the model
        """

        data_splits = data_rdd.randomSplit([.50, .25, .25], seed=0)
        training_set = data_splits[0].repartition(numPartitions=4).cache()
        validation_set = data_splits[1].repartition(numPartitions=4).cache()
        test_set = data_splits[2].repartition(numPartitions=4).cache()
        max_iter_arr = [50,60,80]
        max_runs =  [50,60,80]
        k_list = n_clusters
        best_model = None
        best_rmse = float("inf")
        best_run = 0
        best_k = 0
        for itera, run, k in itertools.product(max_iter_arr, max_runs, k_list):
            try:
                model = KMeans.train(training_set, k, itera, run, "random")
                validation_rmse = model.computeCost(validation_set)
                print("#of clusters k %d\n" % (k))

                if validation_rmse < best_rmse:
                    best_model = model
                    best_rmse = validation_rmse
                    best_run = run
                    best_iter = itera
                    best_k = k
            except Exception as e:
                print(e)
                continue

        # test_preds = best_model.predict(test_set.first())
        print("K-means results...")
        print(str(best_rmse))
        print(str(best_k))

        return best_model
Example #31
    def anomaly_detection_by_KMeans(self,
                                    columns,
                                    k=3,
                                    threshold=4,
                                    normalize=False):
        '''
        Detect anomalous combinations of features with K-Means

        columns: list of columns to be checked
        k: the number of clusters
        threshold: if (distance - mean_distance) > threshold*std, the point is considered an anomaly
        normalize: whether to normalize the data before fitting the clusters

        Output(in self.out):
            output[0]: Index of outliers (list of int)
            output[1]: DataFrame of outliers
        '''
        def error(point):
            center = clusters.centers[clusters.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))

        def addclustercols(x):
            point = np.array(x[1:])
            center = clusters.centers[0]
            mindist = sqrt(sum([y**2 for y in (point - center)]))
            cl = 0
            for i in range(1, len(clusters.centers)):
                center = clusters.centers[i]
                distance = sqrt(sum([y**2 for y in (point - center)]))
                if distance < mindist:
                    cl = i
                    mindist = distance
            clcenter = clusters.centers[cl]
            #return [x[0]]+list(clcenter) + [distance]
            result = list(clcenter) + [mindist]
            return [x[0], cl] + [float(x) for x in result]

        def featurize(df, col_name):
            df_stats = df.select(
                F.mean(F.col(col_name)).alias('mean'),
                F.stddev(F.col(col_name)).alias('std')).collect()
            mean = df_stats[0]['mean']
            std = df_stats[0]['std']
            data = df.withColumn(col_name, (df[col_name] - mean) / std)
            data_stats = data.select(
                F.mean(F.col(col_name)).alias('mean'),
                F.stddev(F.col(col_name)).alias('std')).collect()
            new_mean = data_stats[0]['mean']
            new_std = data_stats[0]['std']
            return data

        def featurize_all(df, columns):
            for i in columns:
                df = featurize(df, i)
            data = df
            return data

        data = self.data
        if 'index' not in data.columns:
            print('Please create index first')
            return
        new_cols_len = len(columns)
        number_type = [
            "BinaryType",
            "DecimalType", "DoubleType", "FloatType", "IntegerType",
            "LongType", "ShortType"
        ]
        all_number_type = True
        new_columns_name = ['index', 'cluster_number']
        for column in columns:
            if column in data.columns:
                all_number_type = (str(data.schema[column].dataType)
                                   in number_type) and (all_number_type)
                if not all_number_type:
                    print('The type of ' + column + " is " +
                          str(data.schema[column].dataType))
                    print("Only numerical type is accepted ")
                    return
                else:
                    new_columns_name.append(column + "_cluster")
            else:
                print(column, "doesn't exist")
                return
        new_columns_name.append('distance_to_cluster')
        origin_data = data.cache()
        data = data.select(['index'] + columns)
        data = data.dropna()
        if normalize:
            data = featurize_all(data, columns)
        target_numpy = data.select(columns).rdd.map(lambda x: np.array(x))
        clusters = KMeans.train(target_numpy, k, maxIterations=20)
        result_data = data.rdd.map(lambda x: addclustercols(x)).toDF(
            new_columns_name)
        full_data = origin_data.join(result_data, 'index', how='inner')
        stat = full_data.groupBy('cluster_number').agg(
            F.mean('distance_to_cluster').alias('distance_mean'),
            F.stddev('distance_to_cluster').alias('distance_std'))
        anomaly_data = full_data.join(
            stat, 'cluster_number',
            'inner').rdd.filter(lambda x: x['distance_to_cluster'] > (x[
                'distance_mean'] + threshold * x['distance_std']))
        try:
            anomaly_data = anomaly_data.toDF()
            anomaly_indices = anomaly_data.select('index')
        except:
            print("No anomalous data found based on your settings")
            return
        else:
            self.out = ([int(i['index'])
                         for i in anomaly_indices.collect()], anomaly_data)
            return
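A minimal NumPy sketch of the thresholding rule described in the docstring above: a point is flagged when its distance to its cluster center exceeds the cluster's mean distance plus threshold standard deviations (the random distances stand in for real ones):

import numpy as np

distances = np.abs(np.random.RandomState(0).randn(1000))  # stand-in distances to assigned centers
threshold = 4
is_anomaly = distances > distances.mean() + threshold * distances.std()
print(int(is_anomaly.sum()), "points flagged as anomalies")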
Example #32
import sys

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.clustering import KMeans

if __name__ == "__main__":
    dirs = "hdfs:///user/clondo46/datasets/gutenberg"
    k = 5
    maxIters = 20
    sc = SparkContext(appName="Proyecto04")
    # Read the documents
    documentos = sc.wholeTextFiles(dirs)
    nombreDocumentos = documentos.keys().collect()
    docs = documentos.values().map(lambda doc: doc.split(" "))
    # Apply TF-IDF
    hashingTF = HashingTF()
    tf = hashingTF.transform(docs)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    # Build the k-means model and create the clusters
    clusters = KMeans.train(tfidf, k, maxIterations=maxIters)
    clustersid = clusters.predict(tfidf).collect()
    diccionario = dict(zip(nombreDocumentos, clustersid))
    d = sc.parallelize(diccionario.items())
    d.coalesce(1).saveAsTextFile("hdfs:///user/clondo46/gut5")
    sc.stop()  # SparkContext stopped
Example #33
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
sc = SparkContext()
data = sc.textFile("dataframe.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=10,
                        initializationMode="random")


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
sameModel = KMeansModel.load(
    sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
Example #34
rdd1 = sc.textFile("5000_points.txt")

rdd2 = rdd1.map(lambda x: x.split())
rdd3 = rdd2.map(lambda x: [int(x[0]), int(x[1])])

from pyspark.mllib.clustering import KMeans

for clusters in range(1, 30):
    model = KMeans.train(rdd3, clusters)
    print(clusters, model.computeCost(rdd3))

for trials in range(10):  #Try ten times to find best result
    for clusters in range(12, 16):  #Only look in interesting range
        model = KMeans.train(rdd3, clusters)
        cost = model.computeCost(rdd3)
        centers = model.clusterCenters  #Let's grab cluster centers
        if cost < 1e+13:  #If result is good, print it out
            print(clusters, cost)
            for coords in centers:
                print(int(coords[0]), int(coords[1]))
            break
# Reduce returns a single result per key containing the counts of 2xx and 3xx responses for each IP
rawTrainingData = rawTrainingData.reduceByKey(extract_features)
print("training dataset after reduce: ", rawTrainingData.collect())
print('total training lines after reduce by key : ', rawTrainingData.count())

# K-means accepts data in the form of [a, b], called a feature vector; use a vector assembler or a map function.
# Convert each record to a feature vector of [count of 2xx, count of 3xx]
training_dataset = rawTrainingData.map(lambda data: data[1])
print("TRAINING DATASET for Kmean cluster: ", training_dataset.collect())
print('total training lines after reformat : ', rawTrainingData.count())

# set cluster count equals to 2
cluster_count = 2

# train the k-means algo to get the model
trained_model = KMeans.train(training_dataset, cluster_count)

# print the cluster centroids from trained model
for center in range(cluster_count):
    print('centre ', center, trained_model.centers[center])

    # streamingData = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group", {"test" : 1})
    # lines = streamingData.map(lambda x:x[1])

# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

stream_data_init = KafkaUtils.createDirectStream(
    ssc, [topic], {"metadata.broker.list": brokers})
# stream_data_init = KafkaUtils.createStream(
#    ssc,
#    zk,
        exit(-1)

    infilenm = sys.argv[1]  # input file name (in s3)
    k = int(sys.argv[2])  # number of clusters to use
    outfilenm = sys.argv[3]

    # Read the main data file
    lines = sc.textFile(infilenm)

    alldata = lines.map(parse_vector)
    # Only want kmeans run on columns fst:lst
    # For weekend only: .filter(lambda arr: np.array(arr[incols['dow']]) == 0
    #       or np.array(arr[incols['dow']] == 6))
    datasub = alldata.map(lambda arr: np.array(arr[fst:lst])) \
        .filter(lambda x: np.count_nonzero(x) > 0)
    clusters = KMeans.train(datasub, k)
    # For each point: figure out the closest cluster center
    # Add each cluster center as additional columns to the original input
    closestcenter = alldata.map(lambda cc: pt_pred_arr(cc))

    # For M.distance calc: need inverted covariance matrix as part of inputs.
    # So: For each cluster 'c', calculate the covariance matrix.
    inv_covmat = []
    for c in range(0, k):
        # Get the actual data columns (subset of the whole line)
        data = closestcenter.filter(lambda arr: np.array(arr[clstrcol]) == c) \
            .map(lambda arr: np.array(arr[fst:lst]))
        # Calc the covariance matrix, and invert
        # Convert from RDD to list, so numpy stats will run against it
        # OR - could write a function to calc the covariance matrix against this RDD ...
        datacol = data.collect()
Example #37
# @author: lnunno
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.clustering import KMeans

TOTAL_DOCS = 39944
NUM_CLUSTERS = 20

def parseVector(line):
    _,indices_tuple_ls = line.split('\t')
    indices_tuple_ls = eval(indices_tuple_ls) # Convert to a real python list.
    return SparseVector(TOTAL_DOCS,indices_tuple_ls)
    
if __name__ == '__main__':
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile('../../data/spark_tf_idf_vectors.tsv')
    data = lines.map(parseVector)
    np.set_printoptions(threshold='nan')
    n = NUM_CLUSTERS
    while n >= 2:
        model = KMeans.train(data,n)
        centers = model.clusterCenters
        with open('../../data/clusters_%d.txt' % (n),'w') as f:
            for c in centers:
                # Format in exponential notation.
                s = ','.join([('%e' % x) for x in c])
                f.write('%s\n'% (s))
        n -= 2
Example #38
#let's generate random class data, add in a cluster center to random 2D points

#use default num of partitions, or use a definte number to make it so that the union
#  will have samples across clusters
c1_v=RandomRDDs.normalVectorRDD(sc,20,2,numPartitions=2,seed=1L).map(lambda v:np.add([1,5],v))
c2_v=RandomRDDs.normalVectorRDD(sc,16,2,numPartitions=2,seed=2L).map(lambda v:np.add([5,1],v))
c3_v=RandomRDDs.normalVectorRDD(sc,12,2,numPartitions=2,seed=3L).map(lambda v:np.add([4,6],v))

#concatenate 2 RDDs with  .union(other) function
c12    =c1_v.union(c2_v)
my_data=c12.union(c3_v)   #this now has all points, as RDD


my_kmmodel = KMeans.train(my_data,k=1,
               maxIterations=20,runs=1,
               initializationMode='k-means||',seed=10L)

#try: help(KMeans.train)  to see parameter options
#k is the number of desired clusters.
#maxIterations is the maximum number of iterations to run.
#initializationMode specifies either random initialization or initialization via k-means||.
#runs is the number of times to run the k-means algorithm (k-means is not guaranteed to find a globally optimal solution, and when run multiple times on a given dataset, the algorithm returns the best clustering result).
#initializationSteps determines the number of steps in the k-means|| algorithm.
#epsilon determines the distance threshold within which we consider k-means to have converged.
 

#type dir(my_kmmodel) to see functions available on the cluster results object

#The computeCost function might not be available on your cloudera vm,
#  spark mlllib, it computes the Sum Squared Error: my_kmmodel.computeCost(my_data)  
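If computeCost is unavailable, the same sum of squared errors can be computed by hand from the model's centers, mirroring the error()/WSSSE pattern used in the other examples in this collection:

import numpy as np

def sq_dist_to_center(point):
    # Squared distance from a point to its assigned cluster center.
    center = my_kmmodel.centers[my_kmmodel.predict(point)]
    return float(np.sum((np.asarray(point) - np.asarray(center)) ** 2))

wssse = my_data.map(sq_dist_to_center).reduce(lambda a, b: a + b)
print(wssse)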
Example #39
'''
USES SPARK MLLIB LIBRARY TO RUN KMEANS IN A HADOOP CLUSTER
AUTOMATICALLY USES ALL RESOURCES AVAILABLE ACROSS NODES
'''

from pyspark.mllib.clustering import KMeans
from numpy import array
import time

luteo_data = sc.textFile('/final_project/luteo_clean.csv')
parsed_data = luteo_data.map(
    lambda line: array([float(x) for x in line.split(',')])).cache()

with open('/usr/local/kmeans_spark_times.txt', 'w') as out_file:
    for n_clusters in range(1, 30):
        start_time = time.time()
        clusters = KMeans.train(parsed_data, n_clusters, maxIterations=100)
        end_time = time.time()
        out_file.write('{0} {1}\n'.format(n_clusters, end_time - start_time))
        print('{0} {1}\n'.format(n_clusters, end_time - start_time))
    data = sc.textFile("s3://ccdatauvamsds2017/YearPredictionMSD.txt")
    #sc.addFile("YearPredictionMSD.txt")
    #sc.addFile("YearPredictionMSD")

    #data = SparkFiles.get('YearPredictionMSD.txt')
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(',')]))

    #K-Means Code
    #Sampling with replacement
    data_sample = parsedData.sample(True, 100, 1234)

    start_time = timeit.default_timer()

    clusters = KMeans.train(data_sample,
                            2,
                            maxIterations=10,
                            initializationMode="random")

    time_Kmeans = (timeit.default_timer() - start_time)
    print(timeit.default_timer() - start_time)
    time_Kmeans_string = str(time_Kmeans)
    file = open("s3://ccdatauvamsds2017/output/Time_logs.txt", "w")
    file.write(time_Kmeans_string)
    file.close()

    #Random Forest Code
    data = MLUtils.loadLibSVMFile(
        sc, sc.textFile("s3://ccdatauvamsds2017/YearPredictionMSD"))

    start_time = timeit.default_timer()
cv_error_storage = []

for w in range(num_folds):
    #new train/validation split
    train = data[0:i] + data[j:]
    val = data[i:j]
    train = sc.parallelize(train)
    val = sc.parallelize(val)
    minError = float("inf")
    bestModel = None
    bestK = None
    test_values = [80, 90, 100, 110, 120, 130, 140]
    #test_values = [120]
    error_storage = []
    for x in test_values:
        model = KMeans.train(train.values(), x, maxIterations=10, runs=10, epsilon=.00001)
        error = model.computeCost(val.values())
        error_storage.append(error)
        print "******     model with " + str(x) + " clusters done in validation fold " + str(w+1) + " ***********"
        print "with error: " + str(error)
        if error < minError:
            bestModel = model
            minError = error
            bestK = x
    cv_error_storage.append(error_storage)
    i = i + partitionSize
    j = j + partitionSize


#get CVerrors (mean of the errors from the 10 cross validated samples)
CVerrors = []
Example #42
sc = SparkContext(appName="kmeans")


def myVec(line):
    from pyspark.mllib.linalg import SparseVector
    return eval("SparseVector" + line)


# Load and parse the data
data = sc.textFile(fname).map(myVec)

# Build the model (cluster the data)
clusters = KMeans.train(data,
                        k,
                        maxIterations=max_iter,
                        runs=runs,
                        initializationMode="random")

# # Evaluate clustering by computing Within Set Sum of Squared Errors
# def error(point):
#     center = clusters.centers[clusters.predict(point)]
#     return sqrt(sum([x**2 for x in (point - center)]))

#WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
# print("Within Set Sum of Squared Error = " + str(WSSSE))

f = open(args.output, "w")
for c in clusters.clusterCenters:
    f.write("[")
    for i in range(len(c)):
Example #43
data = sc.textFile(input_data)
parsedData = data.map(
    lambda line: np.array([float(x) for x in line.split('\t')[2:]])).cache()

print("\n number of keys is {} \n".format(parsedData.count()))


for K in [2,5,10,50,100,150,200,250,300,350,400,450,500,600,700,800,900] + \
                                             list(range(1000,10001,500)):

    ts = time()
    print("\n start to train model, K = {} \n".format(K))
    model = KMeans.train(parsedData,
                         K,
                         maxIterations=max_iteration,
                         initializationMode="k-means||",
                         initializationSteps=2,
                         epsilon=1e-6)

    def error(point):
        center = model.centers[model.predict(point)]
        return sum([x**2 for x in (point - center)])

    WSSSE = parsedData.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("\nK = {};  WSSSE = {}; elapsed time = {} minutes \n".format(
        K, WSSSE, (time() - ts) / 60))
    model.save(sc, join(output, str(K), 'model'))
    #sameModel = KMeansModel.load(sc, join(output, 'model'))
    def calKGroup(self, high, sc):
        #conf = SparkConf().setAppName("SparkSQLKmeans")
        #sc = SparkContext()
        sqlsc = SQLContext(sc)
        MYSQL_USERNAME = ""
        MYSQL_PWD = ""
        #Original URL
        MYSQL_CONNECTION_URL = "jdbc:mysql://1.0.0.127:3306/telegramdb?autoReconnect=true&useSSL=false&user=" + MYSQL_USERNAME + "&password=" + MYSQL_PWD
        info_df = sqlsc.read.format("jdbc").options(
            url=MYSQL_CONNECTION_URL,
            dbtable="information",
            driver="com.mysql.jdbc.Driver").load()
        tag_df = sqlsc.read.format("jdbc").options(
            url=MYSQL_CONNECTION_URL,
            dbtable="tags",
            driver="com.mysql.jdbc.Driver").load()
        col_num = tag_df.filter(tag_df.high == high).count()
        tags = tag_df.filter(
            tag_df.high == high).map(lambda list: list.low).collect()
        cols = {}
        for tag in tags:
            cols[tag] = 0
            #print(tag)
        print(cols)
        #results = info.map(lambda line: array([x[1:-1].replace("{", "").replace("}","") for x in line.low.split(",")])).collect()
        #for temp in results:
        #    print(temp)
        pks = info_df.filter(
            info_df.high == high).map(lambda line: line.PK_aid).collect()
        repos = info_df.filter(info_df.high == high).map(lambda line: {line.PK_aid:json.loads(line.low, \
                                            encoding="utf-8")}).collect()
        rows = info_df.filter(info_df.high == high).map(
            lambda line: {
                line.PK_aid: np.zeros(col_num, dtype=np.int)
            }).collect()
        row_num = info_df.filter(info_df.high == high).count()
        #print(row_num)
        print(row_num)
        print(col_num)
        #print(data)
        #print(rows)
        for index, repo in enumerate(repos):
            #print("[%d] : "%(index)+str(temps))
            for temp in repo:
                print("[%d] : " % (index) + str(repo.get(temp)))
                for element in repo.get(temp):
                    t = element.items()
                    print("->" + str(t) + ", ")
                    #+ str(element.get(element)))
            #for temp in cols:
            #print("[%d] : "%(index)+str(temp))
        check = {}
        key = ""
        for index, repo in enumerate(repos):
            for pk_aids in repo:
                elements = repo.get(pk_aids)
                for element in elements:
                    for col_index, col in enumerate(cols):
                        if element.get(col) is not None:
                            rows[index].get(
                                pk_aids)[col_index] = element.get(col) + 3
                            key = str(element.keys()).strip().replace(
                                "dict_keys([\'", "").replace("\'])", "")
                            if key in check:
                                check[key] += element.get(col)
                            else:
                                check[key] = element.get(col)
                        else:
                            rows[index].get(
                                pk_aids)[col_index] = random.randrange(2)
        for index, row in enumerate(rows):
            for pk_aids in row:
                if rows[index].get(pk_aids) is not None:
                    #print(rows[index].get(pk_aids))
                    if index == 0:
                        data = rows[index].get(pk_aids)
                    else:
                        data = np.append(data, rows[index].get(pk_aids))

        print(str(np.resize(data, (row_num, col_num)).shape))
        #data = np.resize(data, (row_num, col_num)
        clusterdata_1 = sc.parallelize(np.resize(data, (row_num, col_num)))
        model = KMeans.train(clusterdata_1, 10, maxIterations=100, runs=30, \
                             initializationMode="random", seed=10, initializationSteps=10, epsilon=1e-4)
        #model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.9, maxIterations=100, seed=10)
        #for i in range(3):
        #    print ("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
        #        "sigma = ", model.gaussians[i].sigma.toArray())
        labels = model.predict(clusterdata_1).collect()
        temps = []
        for pk in pks:
            temps.append([pk, 0])
        print(labels)
        count = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        for index, label in enumerate(labels):
            temps[index][1] = label + 1
            count[label] += 1
        for index, c in enumerate(count):
            print("Group %d : %d" % (index + 1, c))

        for col in cols.keys():
            for ch in check.keys():
                #print("col : %s, ch : %s"%(col, ch))
                if col == ch:
                    cols[col] = check[ch]
        zeros = ""
        zero_count = 0
        nozeros = ""
        nozero_count = 0
        for col in cols.keys():
            if cols[col] == 0:
                zeros += col + ", "
                zero_count += 1
            else:
                nozeros += col + ", "
                nozero_count += 1
        print("Tags found [%d]: %s" % (nozero_count, nozeros[:-2]))
        print("")
        print("Tags not found [%d]: %s" % (zero_count, zeros[:-2]))

        return temps
Example #45
inputNum_min = float(finalDF1.select(min('inputNum').alias('min_inputNum')).collect()[0]['min_inputNum'])
inputNum_max = float(finalDF1.select(max('inputNum').alias('max_inputNum')).collect()[0]['max_inputNum'])
Min_v = inputNum_min
Max_v = inputNum_max
Norm_inputNum_function = udf(lambda v: (float(v) - Min_v) / (Max_v - Min_v), DoubleType())
finalDF2 = finalDF2.withColumn('Norm_inputNum', Norm_inputNum_function(finalDF2.inputNum))


%pyspark
#conduct OneHotEncoder for project_index column
from pyspark.ml.feature import OneHotEncoder

finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql("SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData")

encoder = OneHotEncoder(dropLast=False, inputCol="project_index", outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

%pyspark
encoded.registerTempTable("dfData")
finalDF3 = spark.sql("SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np
#824 should be revised according to your onehotcode result
RDD = finalDF3.rdd.map(lambda line: SparseVector(824, line["project_Vec"].indices.tolist() + [821, 822, 823], line["project_Vec"].values.tolist() + [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]])).cache()

%pyspark
from pyspark.mllib.clustering import KMeans
clusters = KMeans.train(RDD, 2, maxIterations=10, runs=10, initializationMode="k-means||")
Example #46
    # then we calculate word count and get (docId, count) pair
    docCountTotal = allWords.map(lambda x: (x[1], 1)).reduceByKey(add)
    # we should get (docID, (dicPos, occur), docCount) pair and get (docId, (dicPos, Freq)) pair
    docWithCount = wordsInDocSorted.join(docCountTotal).flatMap(lambda x: ((x[0], j, x[1][1]) for j in x[1][0]))
    # print(docWithCount.take(1))
    docWithFreq = docWithCount.map(lambda x: [x[0], (x[1][0], float(x[1][1]) / float(x[2]))]).groupByKey().map(
        lambda x: (x[0], sorted(x[1])))
    # get the Feature vector for each doc
    docWithFreqVect = docWithFreq.map(lambda x: (x[0], featureVec(x[1])))
    # print(docWithFreqVect.take(1))
    # task2
    # using k-means to cluster these data points
    parsedData = docWithFreqVect.map(lambda x: x[1])

    # Build the model (cluster the data)
    model = KMeans.train(parsedData, 3, maxIterations=10, initializationMode="random")

    # get class and frequent vector
    regenum = re.compile('[^0-9]')
    # keyWithClass = keyAndText.map(lambda x: (x[1], x[0]))
    # classWithFreq = keyWithClass.join(docWithFreqVect).map(lambda x: x[1]).map(lambda x: (regenum.sub('', x[0]), x[1]))

    # testing
    # testLines = sc.textFile("testdata.csv")
    testLines = sc.textFile(sys.argv[2], 1)
    # filter data and transfer data into into (docId, txt) pair
    testValidLines = testLines.map(lambda x: x.split(',')).filter(lambda p: len(p) == 6)
    testKeyAndText = testValidLines.map(lambda x: (x[0], x[1], x[5]))
    # use regular expression to transfer data text into list of words
    # remove all non letter words
    testKeyAndListOfWords = testKeyAndText.map(lambda x: (x[1], regex.sub(' ', x[2]).lower().split()))
    i+=1
  return result

#take seed data and convert to double
trainingRaw = sc.textFile("/FileStore/tables/ghwlpxtt1499907037815/seeds.txt")
trainingData = trainingRaw.map(lambda x: x.split('\t')).map(todouble)

trainingData.collect()


# COMMAND ----------

maxClus = [2,5,7,10,20,40,60,100,200,400]
least = sys.maxint 
c = 0
for val in maxClus:
  clusters = KMeans.train(trainingData, val, maxIterations=20, initializationMode="random")

  WSSSE = clusters.computeCost(trainingData)
  if(least > WSSSE):
    least = WSSSE
    c = val
  
  print("Within Set Sum of Squared Error for "+ str(val) +" is " + str(WSSSE))

print("least WSSSE is " + str(least) + " cluster size of " + str(c))

# COMMAND ----------


Example #48
results_list2 = len(results[1][1])
results_list3 = len(results[2][1])

min_len = min(result_list1, results_list2, results_list3)

results_list = [[], [], []]
results_list[0] = list(results[0][1])[:(min_len - 1)]
results_list[1] = list(results[1][1])[:(min_len - 1)]
results_list[2] = list(results[2][1])[:(min_len - 1)]

datenow = (results[0][0].split(','))[1]

mat = sc.parallelize(np.column_stack(results_list))
#summary = Statistics.colStats(mat)
#print(summary.mean())
clusters = KMeans.train(mat, 3, maxIterations=10)
transformeds = clusters.predict(mat).collect()

first_group, second_group, third_group = 0, 0, 0
for transformed in transformeds:
    predict_value = int(transformed)
    if (predict_value == 0):
        first_group = first_group + 1
    if (predict_value == 1):
        second_group = second_group + 1
    if (predict_value == 2):
        third_group = third_group + 1

cluster_nums = [first_group, second_group, third_group]

print("{} length of trans".format(len(transformeds)))
if __name__ == "__main__":
    sparkConf = SparkConf()
    sparkContext = SparkContext(conf=sparkConf)

    data = sparkContext\
        .textFile("data/clusteringData.txt")

    parsed_data = data\
        .map(lambda line: [float(x) for x in line.split(' ')])\
        .cache()

    number_of_clusters = 4
    number_of_iterations = 20

    clusters = KMeans.train(parsed_data,
                            number_of_clusters,
                            number_of_iterations,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsed_data.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    centers = clusters.clusterCenters
    print("Cluster Centers: ")
    for center in centers:
        print(center)

conf = SparkConf()
conf.set("spark.master", "local")
sc = SparkContext(conf=conf)

data = sc.textFile("practice6_train.csv")
trData = data.map(parseFeat)

data = sc.textFile("practice6_test.csv")
tsData = data.map(parseFeat)
tsLabel = data.map(parseLabel)

kmeans_list = []
for i in range(30):
    kmeans_list.append(KMeans.train(trData, k=10, maxIterations=100, seed=i))

obj_list = []
for i in range(30):
    obj_list.append(
        trData.map(lambda point: error(point, kmeans_list[i])).reduce(
            lambda x, y: x + y))

kmeans = kmeans_list[obj_list.index(min(obj_list))]
tsPredict = kmeans.predict(tsData)

nmi_score = NMI(list(tsPredict.collect()), list(tsLabel.collect()))

f = open('result.txt', 'w')
f.write('NMI of K-Means clustering\n')
f.write('{:.4f}'.format(nmi_score))
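
# The snippet above relies on helpers that are not shown here (parseFeat,
# parseLabel, error, NMI). A minimal sketch of what they might look like --
# these are assumptions, not the original definitions:
import numpy as np
from sklearn.metrics import normalized_mutual_info_score


def parseFeat(line):
    # hypothetical layout: every column except the last is a numeric feature
    return np.array([float(x) for x in line.split(',')[:-1]])


def parseLabel(line):
    # hypothetical layout: the last column holds the class label
    return line.split(',')[-1]


def error(point, model):
    # Euclidean distance from a point to its assigned cluster centre
    center = model.clusterCenters[model.predict(point)]
    return float(np.sqrt(np.sum((point - center) ** 2)))


def NMI(pred, truth):
    # normalized mutual information between predicted clusters and true labels
    return normalized_mutual_info_score(truth, pred)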
def buildModel():
    model = KMeans.train(features,
                         3,
                         maxIterations=5,
                         initializationMode="random")
    return model
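
# `features` is not defined in this fragment; presumably an RDD of numeric
# vectors built earlier in the original script, e.g. (an assumption):
#   features = sc.textFile("input.txt") \
#       .map(lambda line: [float(x) for x in line.split(",")])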
Exemple #52
0
# the parser construction and several imports are cut off above this fragment;
# a presumed minimum (the original also defines --master, --partitions and
# --infile, which the code below relies on):
import argparse
from functools import reduce
from time import clock

from numpy import fromstring
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans

parser = argparse.ArgumentParser()
parser.add_argument('--iterations', help='number of iterations in each training run (default=32)', type=int, default=32)
parser.add_argument('--runs', help='number of training runs (default=10)', type=int, default=10)
parser.add_argument('--clusters', help='number of cluster centers to find (default=128)', type=int, default=128)
parser.add_argument('--config', metavar="KEY=VAL", help="add KEY=VAL to Spark's configuration", action='append', default=[], dest='config')


if __name__ == "__main__":
    args = parser.parse_args()
    print(args)
    protospark = SparkSession.builder.appName("k-means-app").master(args.master)
    spark = reduce(lambda x, y: x.config(*y.split("=")), args.config, protospark).getOrCreate()
    runs = args.runs
    iterations = args.iterations
    partitions = args.partitions
    clusters = args.clusters

    sc = spark.sparkContext
    rdd = sc.textFile(args.infile).map(lambda line: fromstring(line, sep=",")).repartition(partitions)

    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)

    start_time = clock()
    for run in range(runs):
        KMeans.train(rdd, clusters, iterations)
    end_time = clock()
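    # note: time.clock() was removed in Python 3.8; time.perf_counter() is the
    # usual replacement for this kind of timing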

    sc.stop()
    
    print("completed %d run%s in %f seconds" % (runs, (runs > 1 and "s" or ""), end_time - start_time))
Exemple #53
0
import sys

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans


def parseVector(line):
    vector = eval(line[1])
    return np.array([vector.get(x, 0.0) for x in range(max(vector) + 1)])


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print >> sys.stderr, "Usage: kmeans <file> <k> <max_iteration>"
        exit(-1)
    sc = SparkContext(appName="PythonKMeans")
    lines = sc.sequenceFile(
        sys.argv[1],
        "org.apache.hadoop.io.LongWritable",
        "org.apache.mahout.math.VectorWritable",
        valueConverter=
        "com.intel.sparkbench.datagen.pythonconverter.MahoutVectorToStringConverter"
    )
    data = lines.map(parseVector)
    k = int(sys.argv[2])
    max_iterations = int(sys.argv[3])
    model = KMeans.train(data, k, max_iterations)
    print "Final centers: " + str(model.clusterCenters)
Exemple #54
0
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans

import numpy as np
from operator import add

conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
sc = SparkContext(conf = conf)

data = np.array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)

model = KMeans.train(sc.parallelize(data),
                     2,
                     maxIterations=10,
                     runs=30,
                     initializationMode="random",
                     seed=50,
                     initializationSteps=5,
                     epsilon=1e-4)
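
# note: the `runs` argument was deprecated in Spark 1.6 and is ignored by
# newer releases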

# rough cluster-quality scoring: per-cluster and overall mean squared error
labels = model.predict(sc.parallelize(data))
with_labels = sc.parallelize(data).zip(labels)

# squared error of each point against its assigned cluster centre, keyed by cluster id
errors_in_clusters = with_labels.map(
    lambda p: (p[1], float(np.sum((p[0] - model.clusterCenters[p[1]]) ** 2))))

# number of points and total squared error per cluster
cluster_counts = with_labels.map(lambda p: (p[1], 1)).reduceByKey(add).collectAsMap()
cluster_errors = errors_in_clusters.reduceByKey(add).collectAsMap()

# mean squared error per cluster
cluster_mse = {c: cluster_errors[c] / cluster_counts[c] for c in cluster_errors}

# per-dimension mean squared error over all points
with_cluster_centres = with_labels.map(lambda p: (p[0], model.clusterCenters[p[1]]))
mse_array = with_cluster_centres.map(lambda p: (p[0] - p[1]) ** 2).reduce(add) / len(data)
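
# The block above approximates cluster quality with mean squared error rather
# than a true silhouette. A silhouette score is available in the DataFrame-based
# API (pyspark.ml, Spark >= 2.3) via ClusteringEvaluator; a minimal sketch on
# the same toy `data` array (the names below are illustrative, not part of the
# original):
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans as MLKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense(row),) for row in data], ["features"])
predictions = MLKMeans(k=2, seed=50).fit(df).transform(df)
silhouette = ClusteringEvaluator(metricName="silhouette").evaluate(predictions)
print("Silhouette with squared Euclidean distance = %s" % silhouette)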
Exemple #55
0
# the imports and the function header below are presumed from the call to
# createClusteredData further down; `sc` (the SparkContext) and `K` (the number
# of clusters) are assumed to be defined earlier in the original script
from math import sqrt

from numpy import random, array
from sklearn.preprocessing import scale
from pyspark.mllib.clustering import KMeans


# create N fake income/age points spread over k clusters
def createClusteredData(N, k):
    pointsPerCluster = float(N) / k
    X = []
    for i in range (k):
        incomeCentroid = random.uniform(20000.0, 200000.0)
        ageCentroid = random.uniform(20.0, 70.0)
        for j in range(int(pointsPerCluster)):
            X.append([random.normal(incomeCentroid, 10000.0), random.normal(ageCentroid, 2.0)])
    X = array(X)
    return X

random.seed(0)

# Load the data; note I am normalizing it with scale() - very important!
data = sc.parallelize(scale(createClusteredData(100, K)))

# Build the model (cluster the data)
clusters = KMeans.train(data, K, maxIterations=10,
        runs=10, initializationMode="random")

# Print out the cluster assignments
resultRDD = data.map(lambda point: clusters.predict(point)).cache()

print("Counts by value:")
counts = resultRDD.countByValue()
print(counts)

print("Cluster assignments:")
results = resultRDD.collect()
print(results)


# Evaluate clustering by computing Within Set Sum of Squared Errors
# the body of error() is cut off in this fragment; restored to match the same
# helper used in the other examples
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
idf = IDF(minDocFreq=1).fit(tf)

# In[83]:

tfidf = idf.transform(tf)

# In[84]:

# tfidf.collect()

# In[85]:

if algorithm == "K":
    clusters = KMeans.train(tfidf,
                            8,
                            maxIterations=20,
                            initializationMode="random",
                            seed=42)
else:
    clusters = BisectingKMeans.train(tfidf, 8, maxIterations=20, seed=42)
    clusterCenters = clusters.clusterCenters

# In[ ]:

# In[86]:

documentModel = documents1.zip(tfidf)
# cluster_broadcast = sc.broadcast(clusters)

# In[87]:
    max_n_clusters = args.Max_n_components
    filname = "likelihood_" + str(max_n_clusters)
    if (os.path.exists(filname)):
        os.remove(filname)
    print("Data being unloaded in numpy array")
    print("data unloaded")
    myfile = open(filname, "a")
    z = 10
    while z * 100 < max_n_clusters + 1:
        n_clusters = z * 100
        print(n_clusters)
        print("THE VALUE OF NUMBER OF CLUSTERS IS ABOVE")
        model = KMeans.train(data,
                             n_clusters,
                             initializationMode="k-means||",
                             seed=50,
                             initializationSteps=5,
                             epsilon=1e-3,
                             maxIterations=10000)
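        # k-means|| is the scalable variant of k-means++ initialization;
        # epsilon is the convergence tolerance on how far centres may move
        # between iterations before training stops early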
        wssse = model.computeCost(data)
        print("Within Set Sum of Squared Errors = " + str(wssse))

        #  Shows the result.
        centers = model.clusterCenters
        # print("Cluster Centers: ")
        # for center in centers:
        # print(center)
        #     responsibility_matrix, cluster_labels, loglikelihood, cluster_probability = GMMModel.resultPredict(
        #         model, data)

        #     responsibility_matrix_a = responsibility_matrix.take(data_size)
Exemple #58
0
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

sc = SparkContext("local", "My Simple App")

data = sc.textFile("/home/macuser/train.csv")

# skip header
data = data.filter(lambda line: line[0] != 'l')

parsed = data.map(
    lambda line: np.array([float(x) for x in line.split(',')[1:]]))

clusters = KMeans.train(parsed,
                        10,
                        maxIterations=10,
                        runs=1,
                        initializationMode="random")


def error(pt):
    center = clusters.centers[clusters.predict(pt)]
    return np.sqrt(sum([x**2 for x in (pt - center)]))


wssse = parsed.map(lambda pt: error(pt)).reduce(lambda x, y: x + y)
print("Within set sum of squared error: %s" % wssse)
Exemple #59
0
# the opening of the builder chain and the imports it needs are presumed from
# the rest of the fragment
from numpy import array
from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Python Spark SQL basic example") \
    .config("spark.default.parallelism","80") \
    .config("spark.driver.memory","8g") \
    .config("spark.executor.memory","8g") \
    .config("spark.speculation","true") \
    .config("spark.local.dir","/opt/tmp") \
    .getOrCreate()

sc = spark.sparkContext

#data = sc.textFile("file:///opt/workspace/tgtag0528.csv")
df = spark.read.csv("file:///opt/workspace/tgtag0528.csv")
df3 = df.select("_c1", "_c4", "_c8").dropna().rdd.map(lambda x: array(list(x)))
clusters = KMeans.train(df3, 3, maxIterations=10, initializationMode="random")

print(clusters.clusterCenters)
# Evaluate clustering by computing Within Set Sum of Squared Errors
#def error(point):
#   center = clusters.centers[clusters.predict(point)]
#    return sqrt(sum([x**2 for x in (point - center)]))

#WSSSE = df3.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(clusters.computeCost(df3)))

k = 100
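# note: df3 is recomputed for every k in the sweep below; calling df3.cache()
# beforehand avoids re-reading and re-parsing the CSV on each iteration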
for i in range(2, k):
    clusters = KMeans.train(df3, i, maxIterations=100)
    print("%d class:Within Set Sum of Squared Error = " % (i) +
          str(clusters.computeCost(df3)))
Exemple #60
0
import numpy as np
import matplotlib.pyplot as plt
import thunder as td  # assumed: the `td` used below is the thunder package
from pyspark.mllib.clustering import KMeans

images = td.images.frombinary('/user/ds/neuro/fish-long', order='F', engine=sc)
series = images.toseries()

normalized = series.normalize(method='mean')
stddevs = (normalized.map(lambda s: s.std()).sample(1000))
plt.hist(stddevs.values, bins=20)
plt.plot(normalized.filter(lambda s: s.std() >= 0.1).sample(50).values.T)

# perform k-means on the normalized series
ks = [5, 10, 15, 20, 30, 50, 100, 200]
models = []
for k in ks:
    models.append(KMeans.train(normalized.values._rdd.values(), k))


# define a couple functions to score the clustering quality
def model_error_1(model):
    def series_error(series):
        cluster_id = model.predict(series)
        center = model.centers[cluster_id]
        diff = center - series
        return diff.dot(diff)**0.5

    return (normalized.map(series_error).toarray().sum())


def model_error_2(model):
    return model.computeCost(normalized.values._rdd.values())
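

# A possible way to use the two scoring functions above (an illustration, not
# part of the original): compute an error per candidate k and look for the knee
# of the curve.
errors = [model_error_2(model) for model in models]
plt.plot(ks, errors)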