Example #1
    def perform_training(sc: SparkContext, params_dict: dict):
        normal_ekg_data_path = None if 'normal_ekg_data_path' not in params_dict else params_dict[
            'normal_ekg_data_path']
        min_num_of_clusters = 5 if 'min_num_of_clusters' not in params_dict else int(params_dict['min_num_of_clusters'])
        max_num_of_clusters = 20 if 'max_num_of_clusters' not in params_dict else int(
            params_dict['max_num_of_clusters'])
        boundary_ratio = 0.8 if 'boundary_ratio' not in params_dict else float(params_dict['boundary_ratio'])

        ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
            lambda line: np.array([float(val) for val in line.split(',')]))

        # ekg_rdd_data.foreach(Plotter.plot_signal_window)
        k_range = range(min_num_of_clusters, max_num_of_clusters, 1)
        prev_cost = float(np.inf)
        final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
        cost_ratios = []
        found_best = False
        for k in k_range:
            km = KMeans.train(ekg_rdd_data, k)
            # cost equals to sum of squared distances of samples to the nearest cluster centre
            cost = km.computeCost(ekg_rdd_data)
            ratio = cost / prev_cost
            prev_cost = cost
            cost_ratios.append(ratio)
            if ratio > boundary_ratio and not found_best:
                final_km = km
                found_best = True

        Plotter.plot_elbow(cost_ratios, k_range)
        return final_km
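
Plotter is an external helper that is not part of this snippet; a minimal sketch of an equivalent plot_elbow, assuming matplotlib is available, could look like the following (an illustration, not the original class):

import matplotlib.pyplot as plt

def plot_elbow(cost_ratios, k_range):
    # Plot the cost ratio obtained for each candidate k so the "elbow" is visible.
    plt.plot(list(k_range), cost_ratios, marker='o')
    plt.xlabel('number of clusters k')
    plt.ylabel('cost(k) / cost(previous k)')
    plt.title('Elbow plot for KMeans model selection')
    plt.show()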
Example #2
def kmeans():
    """
    Run K-means clustering with MLlib on the test dataset shipped with the Spark
    installation. Because the train method expects:
        Training points as an `RDD` of `Vector` or convertible
    the dataset has to be formatted first:
        raw dataset       --> ['0.0 0.0 0.0', '0.1 0.1 0.1', '0.2 0.2 0.2']
        formatted dataset --> [array([0., 0., 0.]), array([0.1, 0.1, 0.1]), array([0.2, 0.2, 0.2])]
    :return: the trained KMeansModel loaded back from disk
    """
    data_rdd = sc.textFile('{}/mllib/kmeans_data.txt'.format(current_dir))
    parsed_data_rdd = data_rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the clustering model
    clusters = KMeans.train(parsed_data_rdd, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = parsed_data_rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save the trained model
    model_path = "{}/kmeans_model".format(current_dir)
    if not os.path.exists(model_path):
        clusters.save(sc, model_path)

    trained_model = KMeansModel.load(
        sc, "{}/kmeans_model".format(current_dir)
    )
    return trained_model
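
A hedged usage sketch for the function above, assuming sc and current_dir are defined at module scope as the body implies:

from numpy import array

trained_model = kmeans()
# index of the cluster nearest to a sample point
print(trained_model.predict(array([0.2, 0.2, 0.2])))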
Example #3
def logo_feature_cluster(train_feature_list, train_name_list, clusternum):
    '''Training'''
    model = KMeans.train(sc.parallelize(train_feature_list),
                         clusternum,
                         maxIterations=10,
                         initializationMode="random",
                         seed=50,
                         initializationSteps=5,
                         epsilon=1e-4)
    model_path = tempfile.mkdtemp()
    model.save(sc, model_path)
    model = KMeansModel.load(sc, model_path)
    '''Prediction'''
    predict = model.predict(sc.parallelize(train_feature_list))
    # print(predict.collect())
    try:
        rmtree(model_path)
    except OSError:
        pass

    logo_result_path = os.path.join(
        result_path, "logo_image_result" + str(clusternum) + ".txt")
    writeResultTofile(logo_result_path, train_name_list, predict.collect())
    '''Calinski-Harabasz clustering evaluation metric'''
    # evaluationCH = metrics.calinski_harabaz_score(train_feature_list, predict.collect())
    # ch = str(round(evaluationCH, 2))
    # print("Calinski-Harabasz clustering evaluation metric:" + ch)
    # with open(result_path + "Calinski-Harabasz.txt", 'a') as a:
    #     a.write(str(clusternum)+":" + ch + "\n")
    '''Silhouette Coefficient clustering evaluation metric'''
def loadModel():
	clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
	classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)

	if pv.outputDebugMsg:
		Utils.logMessage("\nLoad cluster & classification model finished")
	return clusterModel, classificationModel
def kmeans_w2v_predict():
    # appName='kmeans_w2v_predict'
    # sc = SparkContext(appName=appName)
    # from pyspark.sql import SQLContext
    from pyspark.mllib.clustering import KMeans, KMeansModel
    # sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lambert_w2v_data_jan")
    # df = data.toDF("text", "filtered_text", "split_text", "vectors", "id")
    df = data.toDF("tokens", "vectors", "id")
    df = df.where(df.vectors.isNotNull())
    data = df.rdd
    model = KMeansModel.load(sc,
                             "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")

    # data = data.map(lambda (text, filtered_text, split_text, vectors, id): (text, filtered_text, split_text, vectors, model.predict(vectors), id))
    # df = data.toDF(["text", "filtered_text", "split_text", "vectors", "cluster", "id"])
    data = data.map(lambda (tokens, vectors, id):
                    (tokens, vectors, model.predict(vectors), id))
    df = data.toDF(["tokens", "vectors", "cluster", "id"])
    df = df.select("cluster", "id")
    df = df.sort(df.cluster.asc())
    df.write.format("com.databricks.spark.csv").mode("overwrite").save(
        "lambert_w2v_data_cluster.csv")
    # df.save("hdfs:///user/rmusters/lambert_w2v_data_cluster.csv", "com.databricks.spark.csv")
    df.write.parquet("hdfs:///user/rmusters/lambert_w2v_data_cluster",
                     mode="overwrite")
Example #6
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    if not fs.exists(
            sc._jvm.org.apache.hadoop.fs.Path(HDFS_PATH + str(g_cache.user) +
                                              '/model/' + params['path'])):
        raise Exception("Invalid file path, path not exists!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    else:
        raise Exception("Invalid model type!")
    return True, model
def getCluster(price, crime, male, female, white, black, asian, hispanic,
               young, mid_age, senior):
    KModel = KMeansModel.load(sc, "project/data/output/KMeansModel")
    cluster = KModel.predict([
        price, crime, male, female, white, black, asian, hispanic, young,
        mid_age, senior
    ])
    return cluster
Example #8
    def KMeans_Processing(self, columns):
        data_point = np.array(self.df_PD[columns])
        model = KMeansModel.load(
            self.sc, self.baseDir + '/fraudModel/Model/' + 'KMeans')
        result = np.array(
            model.predict(self.sc.parallelize(data_point)).collect())
        self.df_PD.insert(len(list(self.df_PD.columns)), 'KMeans_feature',
                          result)
Example #9
def assign_pooling(data):

    image_name, feature_matrix = data[0]
    clusterCenters = data[1]

    feature_matrix = np.array(feature_matrix)

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)

    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    #print(image_name + " in group: " + str(group))
    return [(image_name, group)]
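
A hedged usage sketch for this variant; the features DataFrame, the trained model and the broadcast variable are assumptions for illustration, not part of the original snippet:

# Hypothetical usage: features_df has 'fileName' and 'features' columns,
# kmeans_model is a trained KMeansModel, sc is the active SparkContext.
centers_broadcast = sc.broadcast(kmeans_model.clusterCenters)
image_groups = features_df.rdd.flatMap(
    lambda row: assign_pooling(row, centers_broadcast, pooling="max"))
print(image_groups.take(5))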
Example #11
def main():
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")

    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    #buckets = tile_avgs.collect()
    #print("Bucket",buckets)
    tileMap = tile_avgs.map(
        lambda l: [item for sublist in l for item in sublist])
    tileList = tileMap.collect()
    print("Tile Map", tileMap)
    print("Tile Map", tileMap.collect())
    print("Tile List", tileList)
    print("Tile LIst", type(tileList))
    ############################################################################

    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    #Randomly Get small images using kmeans match
    for tile in tileList:
        if tile[0] == currentRow:
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
            noOfCol = noOfCol + 1
        else:
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
            smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                      tileSize, clusterIndex)
            readyToCombine.append(smallImg)
    #Put small images into the big image canvas

    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)

    #Print Image
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    #print("Before Print, Check Once again",readyToCombine)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow,
                             tileSize)

    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
Example #12
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    centers = sameModel.clusterCenters
    print("Cluster Centers: ")
    for n, center in enumerate(centers):
        out_f = OUTPUT_DATA + str(n) + "Cluster.csv"
        numpy.savetxt(out_f, center, newline=";")
        print(center)
def main():
    modelname = sys.argv[1]
    tiffname = sys.argv[2]
    outputname = sys.argv[3]
    sc = SparkContext()
    model = KMeansModel.load(sc, modelname)
    dataset = gdal.Open(tiffname, GA_ReadOnly)
    x, y, data = train.tiff_to_array(dataset, train.weights)
    driver = dataset.GetDriver().ShortName
    clusterdata = sc.parallelize(data)
    result = np.array(clusterdata.map(lambda point: model.predict(point)).collect())
    write_to_tif(outputname, x, y, result, driver)
def assign_pooling(data):

    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]

    image_name = row['fileName']
    feature_matrix = np.array(row['features'])

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
Example #15
def kmeans_classification(sc, c_tag, util):
    print 'data retrieving'
    _data_ = data_retriever(c_tag)
    print len(_data_)
    #rids = [x[0] for x in _data_]

    blc = nlpb.nlpblockbase()

    __ans = [train_feature_extraction(x, c_tag, blc, util) for x in _data_]
    ans = [[float(k) for k in x] for x in __ans]
    #print ans
    train_data = [np.array(sf.softmax(x)) for x in ans]
    #train_data = ans
    print 'dataprep done'
    assert_len(train_data)
    brotrain = sc.broadcast(train_data)
    clusters = KMeans.train(sc.parallelize(brotrain.value),
                            200,
                            maxIterations=10,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    print len(_data_)
    cnt = 0
    for row in _data_:
        clustered[clusters.predict(train_data[cnt])].append(
            [row[0], row[2], row[3], row[4]])
        cnt += 1
    print len(clustered)
    for k in clustered.keys():
        print len(clustered[k])

    clusters.save(sc, "MovieModel")
    sameModel = KMeansModel.load(sc, "MovieModel")

    ref = collections.defaultdict(list)
    for point in train_data:
        ref[sameModel.predict(point)].append(point)

    for x, y in zip(ref.keys(), clustered.keys()):
        assert len(ref[x]) == len(clustered[y])

    return clustered
Example #16
def kmeansInitialClusters(dataset):
    model = KMeansModel(CENTER_VECTORS)
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result=[]
    for d in dataset.collect():
        entry = {}
        entry["features"] = d["features"]
        entry["prediction"] = trainedModel.predict(Vectors.parse(Vectors.stringify(d['features'])))
        entry["label"] = d['label']
        result.append(entry)

    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity", "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
Example #17
def main(sc):
    data = [[1.0, 1.0], [1.0, 0.8], [-1.0, 1.0], [-1.0, -1.0]]
    parsedData = sc.parallelize(data)
    kmeansModel = KMeans.train(parsedData,
                               2,
                               maxIterations=10,
                               runs=10,
                               initializationMode="random")
    print(kmeansModel.predict([1.0, 1.0]))
    print(kmeansModel.predict([1.0, -2.0]))
    # Save and load model
    kmeansModel.save(sc, "KMeansModel")
    model = KMeansModel.load(sc, "KMeansModel")
    print(model.predict([1.0, 1.0]))
    print(model.predict([1.0, -2.0]))
Example #18
    def print_model(self,model_name):
        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        for c in model.clusterCenters:
            print(c)
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)
Example #19
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(
        lambda line: array([float(x) for x in line.split(';')]))

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    # Predict cluster labels per row
    labels = sameModel.predict(dataset).collect()

    # Save labels in json file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)
Example #20
def predict():
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.linalg import Vectors

    data = sqlContext.read.format("com.databricks.spark.csv").option(
        "header", "true").load("w2v_vector.csv")
    data = data.map(lambda x: [float(a) for a in x])
    df = data.toDF()
    columns = df.columns
    vectors = df.select(columns[1:71])
    vectors = vectors.map(lambda x: Vectors.dense(x))

    for n_clusters in _range:
        model = KMeansModel.load(
            sc, "hdfs:///user/rmusters/w2v_model_kmeans_" + str(n_clusters))

        predicted = model.predict(vectors)
        result = predicted.map(lambda x: (x, )).toDF()
        result.save("clusters_" + str(n_clusters))
def kmeans_lda_predict():
    appName = 'kmeans_lda_predict'
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
    sc = SparkContext(appName=appName)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet(
        "hdfs:///user/rmusters/lda_doc_topic")  #lda_data_jan
    # data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/kmeans_lda_jan")
    data = data.map(lambda (id, vectors):
                    (id, vectors, model.predict(vectors)))
    df = data.toDF(["id", "vectors", "cluster"])
    df = df.sort(df.cluster.asc())
    # df.write.parquet("hdfs:///user/rmusters/lda_data_cluster", mode= "overwrite")
    df.write.parquet("hdfs:///user/rmusters/lda_data_cluster",
                     mode="overwrite")

    logger.info(appName)
Example #22
    def predict(self, model_name, data):

        '''
        predict unknown data
        :param model_name: the trained model saving on hdfs
        :param data: unknown data
        :return: (cluster_index, cluster)
        '''

        # try to load the specified model
        path = self.base + model_name
        try:
            model = KMeansModel.load(self.sc, path)
        except:
            raise Exception('No such model found on hdfs!')

        # get the predict : means which cluster it belongs to
        index = model.predict(data)
        print('Data:%s belongs to cluster:%s. The index is %s' % (data, model.clusterCenters[index], index))
        return index, model.clusterCenters[index]
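
A hedged usage sketch for this method; the wrapper instance and the model directory name are assumptions for illustration:

# Hypothetical call: clusterer is an instance of the surrounding class and
# 'my_kmeans' an assumed model directory under its base path.
index, center = clusterer.predict('my_kmeans', [0.0, 0.0, 0.0])
print(index, center)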
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute rotations for each split of the data using the given coarse quantizers.
    """

    Rs = []
    mus = []
    counts = []
    for split in xrange(2):

        print 'Starting rotation fitting for split %d' % split

        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])

        # Get kmeans model
        model = KMeansModel(Cs[split])

        R, mu, count = compute_local_rotations(sc, data, model, M / 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)

    return Rs, mus, counts
        
def parseOwnership(line):
    fields = line.split(',')
    #print len(fields)
    owner = fields[0]
    taxes = int(fields[1])
    lat = float(fields[2])
    lon = float(fields[3])
    cluster = int(fields[4])
    
    return (owner, taxes, lat, lon, cluster)

conf = SparkConf().setMaster("local").setAppName("tugasbigdata")
sc = SparkContext(conf = conf)

clusters = KMeansModel.load(sc,"C:/SparkCourse/FP_model")

lines = sc.textFile("file:///SparkCourse/data_center.csv")
parsedLines = lines.map(parseLine)
# clean up the data
reserveddata= parsedLines.filter(lambda x : x is not None)
reserveddata1= reserveddata.filter(lambda x : x[1] is not None)
reserveddata2=reserveddata1.filter(lambda x : x[2] is not None)
temp=reserveddata2.filter(lambda x : "San Francisco" in x[0])
tempdata=temp.map(lambda x: (x[1],x[2]))
data=tempdata.map(lambda x: (float(x[0]),float(x[1])))

data_local = data.collect()

ownershipData = lines.map(parseOwner)
# more clean-up
Example #25
def main():
    data_rdd = load_data(project["data_file"])
    print (data_rdd.count())

    listed_data_rdd = data_rdd.map(data_extractor)

    # Filtering unwanted data rows
    elect_filtered_rdd = listed_data_rdd.filter(electric_vehicles_filter)
    filtered_rdd = elect_filtered_rdd.filter(empty_cost_filter)

    # Mapping related data to convenient format for clustering
    cost_tx_rdd = filtered_rdd.map(cost_transform_mapper)
    feature_mapped_rdd = cost_tx_rdd.map(feature_mapper)

    estimated_clusters = KMeansModel.load(spark_context, "identified_clusters")

    optimum_cluster = 4
    optimum_points_rdd = feature_mapped_rdd.filter(
        lambda filtered_data_feature_vector: estimated_clusters.predict(filtered_data_feature_vector[-1])
        == optimum_cluster
    )

    print (optimum_points_rdd.count())

    sample_data = optimum_points_rdd.take(100)

    for data in sample_data:
        feature_vector = data[-1]
        pyplot.scatter(feature_vector[0], feature_vector[1])

    optimum_cluster_manufactures_rad = optimum_points_rdd.map(manufactures_mapper)

    optimum_points_rdd.persist()  # To hold the previously calculated data set in memory

    individual_manufactures_count_rdd = optimum_cluster_manufactures_rad.reduceByKey(operator.add)

    sorted_manufactures_count_rdd = individual_manufactures_count_rdd.sortBy(
        lambda manufactures_set: manufactures_set[1], ascending=False
    )

    top_ten = 10
    vehicle_count = []
    manufactures_name = []
    for manufacture in sorted_manufactures_count_rdd.take(top_ten):
        vehicle_count.append(manufacture[1])
        manufactures_name.append(manufacture[0])
        print (manufacture)

    pyplot.title("Best Vehicle Cluster")
    pyplot.xlabel("Europe Rating")
    pyplot.ylabel("Feature Normalized")

    pyplot.show()

    number_of_manufactures = top_ten  # sorted_manufactures_count_rdd.count()
    index = np.arange(number_of_manufactures)

    bar_width = 0.5

    opacity = 0.4
    error_config = {"ecolor": "0.3"}

    chart = pyplot.bar(
        index, vehicle_count, bar_width, alpha=opacity, color="b", error_kw=error_config, label="manufactures"
    )
    pyplot.xticks(index + bar_width, manufactures_name)

    pyplot.title("Top preforming vehicles")
    pyplot.xlabel("manufactures")
    pyplot.ylabel("Vehicles count")

    pyplot.show()
#print np.shape(parsedData)
# Build the model (cluster the data)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
	center = clusters.centers[clusters.predict(point)]
	#print center
	return sqrt(sum([x**2 for x in (point - center)]))
#WSSE = parsedData.map(lambda point:error(point)).reduce(lambda x,y:x+y)
WSSE = np.zeros(7)  # indexed by i = 2..6 in the loop below
import time
for i in range(2,7):
	t = time.time()
	clusters = KMeans.train(parsedData, i, maxIterations=100,
        	runs=100, initializationMode="random")
	WSSE[i] = (parsedData.map(lambda point:error(point)).reduce(lambda x,y :x+y))
	print str(WSSE[i])+"   "+str(i)+"   WITH TIME ="+str(time.time()-t)

#WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#print("Within Set Sum of Squared Error = " + str(WSSSE))



# Save and load model
clusters.save(sc, "./mymodel1")
sameModel = KMeansModel.load(sc, "./mymodel1")

#print clusters

from numpy import array
from math import sqrt

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
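
As a side note on the script above, pyspark.mllib's KMeansModel also exposes computeCost (already used in Example #1); a short sketch, reusing clusters and parsedData and placed before sc.stop():

# computeCost (Spark >= 1.4) returns the sum of squared distances to the nearest
# center, whereas error() above sums plain Euclidean distances, so the two numbers differ.
cost = clusters.computeCost(parsedData)
print("Sum of squared distances (computeCost) = " + str(cost))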
Example #28
import sys

from collections import OrderedDict
from numpy import array
from math import sqrt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeansModel

if __name__ == "__main__":
    if (len(sys.argv) != 2):
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
          "predict.py kddcup.data.file"
        sys.exit(1)

    data_file = sys.argv[1]
    conf = SparkConf().setAppName("KDDCup99")
    # .set("spark.executor.memory", "2g")

    sc = SparkContext(conf=conf)

    model = KMeansModel.load(sc, "best_model")

    clusters = model.clusterCenters

    with open(data_file) as file:
        for line in file:
            line_split = line.split(",")
            clean_line_split = [line_split[0]] + line_split[4:]
            clusterIndex = model.predict(
                array([float(x) for x in clean_line_split]))
            print clusterIndex
    print

print "DONE!"
Example #29
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=20,
                        runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
def process(id, content):
	kmeansDoc = DocumentKmeans(id, '')
	print(type(content))
	kmeansDoc.content = content
	kmeansDoc.doc2vec()
	kmeansDoc.printVec()
	kmeansDoc.kmeansVec()
	kmeansDoc.cluster_id = kmeansPredict.predict(kmeansDoc)
	return kmeansDoc
    
utils = Utils(45000)

sc = SparkContext(appName="PythonKafkaConsumerKmeans")

kmeansModel = KMeansModel.load(sc, '../KmeansModel')

kmeansPredict = KmeansPredict(kmeansModel)

consumer = KafkaConsumer('test', group_id='kafka_consumer_group', bootstrap_servers=['localhost:9092'])

for message in consumer:
	value = message.value
	spl = value.split(':')
	id = spl[0]
	content = utils.normalizeString(spl[1])
	docu = process(id, content)
	print(docu.cluster_id)
    
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)
KafkaConsumer(value_deserializer=lambda m: json.loads(unicode(m, "utf8")))
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations. 
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
  center = clusters.centers[clusters.predict(point)]
  return sqrt(sum([x**2 for x in (point - center)]))
  
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))


# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters

  


Example #32
    def kmeans_model_load(self, sc, path):
        return KMeansModel.load(sc, path)

    def __init__(self, spark_context, model_path):
        self.model = KMeansModel.load(spark_context, model_path)
input_living_index = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_living_index = sqlContext.read.parquet(input_living_index)
parquet_living_index.registerTempTable("living_index_table")
living_index_table = sqlContext.sql("SELECT * FROM living_index_table")
living_index_rdd = living_index_table.map(lambda colName: (str(colName.Community_Code) + "," + str(colName.Crime_Frequency)
                                                              + "," + str(colName.Housing_Crowded) + "," + str(colName.Household_BPL)
                                                              + "," + str(colName.Unemployed) + "," + str(colName.Without_Diploma)
                                                              + "," + str(colName.Age_Bar) + "," + str(colName.Per_Capita_Income)
                                                              + "," + str(colName.Hardship_Index)))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = living_index_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute squared error and change cluster centers
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model
clusters.save(sc, "myModel/living-index")
sameModel = KMeansModel.load(sc, "myModel/living-index")
Example #35
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

from pyspark import SparkContext

sc = SparkContext(appName="AA_kmeans")

# Load and parse the data
data = sc.textFile("hdfs://namenode/kmeans_data.txt")
#print(data.take(5))
#print data.map(lambda line: array([x for x in line.split(' ')])).collect()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://namenode/myModelPath")
sameModel = KMeansModel.load(sc, "hdfs://namenode/myModelPath")
from numpy import array
from math import sqrt


from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from pyspark_cassandra import streaming

def predict(line):
    spl = line.split(':')
    doc = DocumentKmeans(spl[0], spl[1], spl[2])
    doc.kmeansVec()
    doc.cluster_id = kmeansPredict.predict(doc)
    return doc


sc = SparkContext()
sameModel = KMeansModel.load(sc, "../KmeansModel")
kmeansPredict = KmeansPredict(sameModel)

parsedData = sc.textFile('hdfs://localhost:8020/user/manh/vector')\
	.filter(lambda x: len(x) > 2000)\
	.map(lambda x: predict(x))\
	.map(lambda x: {
		'id' : x.id,
		'cluster_id' : int(x.cluster_id),
		'timestamps' : long(x.timestamps),
		'vector' : x.vector
	}).saveToCassandra('reishi', 'dockmeans')
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("/home/grijesh/sampleData/k-means-data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10,
                        runs=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
Example #38
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("k_means_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData,
                            2,
                            maxIterations=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    # $example off$

    sc.stop()
        print arrays.collect()
        indx = 0
        while indx < count:
            vec = Vectors.dense(arrays.collect()[indx])
            indx += 1
            clusternum = model.predict(vec)
            print "Cluster -> ", clusternum, vec
    return


# Create a local StreamingContext with two working thread and batch interval of 1 second
conf = SparkConf().setAppName("Fraud Detector")
conf = conf.setMaster("local[2]")

sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 8999)
# Split each line into words

model = KMeansModel.load(sc, "kmeansmodel01")
print model.clusterCenters
print "************************** Loaded the model *********************"

words = lines.flatMap(lambda line: line.split(" "))

lines.foreachRDD(detect)
ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
from pyspark import SparkContext
# Load and parse the data
sc = SparkContext()

data = sc.textFile("/user/hduser/venkat/iris.txt")
print data.first()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=10,
                        initializationMode="random")
prediction = clusters.predict(parsedData)
print clusters.centers


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
myModelPath = "/user/hduser/output/kmeans_output"
clusters.save(sc, myModelPath)
sameModel = KMeansModel.load(sc, myModelPath)
    # check input db
    input_db = os.path.join(args.input_root_dir, "dbs", "db", "out.parquet")
    if not os.path.isdir(input_db):
        raise Exception("missing db parquet directory")

    # check output dir
    # logger.debug("Create new codebook dir...")
    output_dir = os.path.join(args.input_root_dir, 'features', 'feature')
    if os.path.isdir(output_dir):
        new_name = output_dir + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
        logger.info("backup old output-dir in  %s" % new_name)
        os.rename(output_dir, os.path.join(args.input_root_dir, new_name))
    os.makedirs(output_dir)

    model = KMeansModel.load(sc, input_codebook)

    model = sc.broadcast(model)

    pooling="max"
    feature_name = "SIFT"
    df = sqc.read.parquet(input_db)

    print df.count()

    features_bow = df.map(functools.partial(compute_global_feature,
                                                feature_name="SURF",
                                                model=model,
                                                pooling=pooling))

    print features_bow.first()
Example #42

# Load and parse the data
# conf = SparkConf()
sc = SparkContext()
data = sc.textFile("./business_gps.csv")
parsedData = data.map(parse_line)

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        10,
                        maxIterations=1000,
                        runs=10,
                        initializationMode="k-means")


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "kmeans_model")
sameModel = KMeansModel.load(sc, "kmeans_model")

print("Cluster centers", sameModel.clusterCenters)
Example #43
conf = SparkConf().setAppName('KMeans').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data
data = sc.textFile('../data/kmeans_data.txt')
parseData = data.map(
    lambda line: np.array([float(x) for x in line.split(' ')]))

# build the model
clusters = KMeans.train(parseData,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode='random')


#evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parseData.map(lambda p: error(p)).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error :' + str(WSSSE))

# save and load model
clusters.save(sc, '../model/KMeansModel')
sameModel = KMeansModel.load(sc, '../model/KMeansModel')

sc.stop()
        "hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv"
    )

    average_per_year = average_year(lines)  # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(),
                                                   k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data,
                                k,
                                maxIterations=100,
                                initialModel=KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model,
                                           count_lines)
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)
    else:
        modelpath = ascontext.getModelContentToPath("model")
        model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)

predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel,predictors,df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df,predictors,setToFlag=1.0)

def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result


mapFn = lambda (x,y):rowToList(x)+[y]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])
def main():
    k_input_model = sys.argv[1] #read kmean model from this location
    w_input_model = sys.argv[2] #read word2vec model from this location
    input_file = sys.argv[3] #read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmean and Word2Vec model'''
    kmean_model = KMeansModel.load(sc,k_input_model)
    word2vec_model = Word2VecModel.load(sc,w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText','overall','reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
       
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a list of all words in our model'''
    keys = sqlContext.read.parquet(w_input_model+"/data")
    keys_list = keys.rdd.map(lambda line: line.word).collect()

    '''sbaronia - here we create one vector per review, where vector
    contains the number of times a cluster is assigned to a word in
    a review. We make a SparseVector compatible format'''
    features = []

    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                # histogram starts at all zeros, so this is a plain count increment
                histogram[clust] = histogram[clust] + 1
        features.append((2000,range(2000),histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                             .map(lambda line: nor.transform(SparseVector.parse(line))) \
                             .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()
    
    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                            .select('rating','feature') \
                            .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                            .coalesce(1) \
                            .cache()
    
    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                           .select('rating','feature') \
                           .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                           .coalesce(1) \
                           .cache()

    '''sbaronia - find best step using validation and run LinearRegressionWithSGD 
    with that step and report final RMSE'''
    step_best_norm = validation(train_rdd)

    RMSE_norm = regression_and_error(train_rdd,test_rdd,step_best_norm)

    print("Final RMSE(Normalization) = " + str(RMSE_norm) + "  Best Step size = " + str(step_best_norm))
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

def parse_line(ln):
	split_ln = ln.split(',')
	ln_coord = [float(split_ln[1]), float(split_ln[2])]
	new_line = ln + ',' + cluster_labels[ KMeans_model.predict(ln_coord)]
	return new_line

# load and parse the data
# conf = SparkConf()
sc = SparkContext()

# load previously generated k-means model
KMeans_model = KMeansModel.load(sc, "kmeans_model")

# define cluster label array
cluster_labels = ["Pheonix-AZ", "Edinburgh-UK", "Charlotte-NC", "Madison-WI", "Montreal-Canada", "Waterloo-Canada", "Las Vegas-NV", "Urbana-Champaign-IL", "Pittsburgh-PA", "Karlsruhe-Germany"]

# read the file which has business_ids, latitude, longitude
data = sc.textFile("./business_gps.csv")

# get labelled rows
parsedData = data.map(parse_line)

# save labelled businesses in the output folder
parsedData.saveAsTextFile("./output")
Example #48
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel


if __name__ == "__main__":
    sc = SparkContext(appName="KMeansApp")  # SparkContext

   
    # Load and parse the data
    data = sc.textFile("s3://irm238FinalProject/input/citibike*")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KmeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    

    sc.stop()
import sys
import json
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel


def mapper(line):
    # Format the line
    line = line.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
    elements = line.split(",")
    stock_name = elements.pop(0)
    percent_changes = map(lambda x: float(x), elements)

    return stock_name, percent_changes


if __name__ == "__main__":
    sc = SparkContext(appName="ComputeResults")

    model = KMeansModel.load(sc, sys.argv[2])

    mapred_results = sc.textFile(sys.argv[1])
    clusters = mapred_results.map(mapper)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda stock: (model.predict(array(stock[1])), [stock[0]]))\
        .reduceByKey(lambda a, b: a + b)\
        .collectAsMap()

    with open('result.json', 'w') as fp:
        json.dump(clusters, fp)
sqlContext = SQLContext(sc)

# Read the input parquet
input_crime = sys.argv[1]

# Read the parquet data and convert to RDD
parquet_crime = sqlContext.read.parquet(input_crime)
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(lambda line: str(line.Year) + "," + str(line.Latitude) + ","
                                       + str(line.Longitude) + "," + str(line.Crime_Frequency))

# K-means does multiple runs to find the optimal cluster center, so cache the input to K-means
cluster_input = crime_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5,
        runs=5, initializationMode="random")

# Compute the error of each point relative to its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save and reload the cluster model
clusters.save(sc, "myModel_crime")
sameModel = KMeansModel.load(sc, "myModel_crime")
Example #51
if __name__ == "__main__":

    b = open("name", 'wb')
    sc = SparkContext("local[*]", "kmeans")

    print("data being loaded.....")
    data = sc.textFile(
        sys.argv[1]).map(lambda row: map(lambda x: float(x), row.split(',')))
    #file:///dev/desc_hdfs
    print("data loaded!")
    D = 128
    print("loading and counting")
    data_size = data.count()
    print("count done")
    print("model being loaded.....")
    model = KMeansModel.load(sc, sys.argv[2])
    print("model loaded!")

    centers = model.clusterCenters
    # ################SAMPLING##################################################
    #total_sampled_points = int(sys.argv[3])
    cluster = {}
    samples = {}
    print("data being stored in array....")
    #da = data.collect()
    print("data stored")

    n_clusters = model.k

    for j in range(n_clusters):
        cluster[j] = []
Example #52
    currTime = strftime("%Y-%m-%d-%H-%M-%S")
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
    dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
    predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

    average_per_year = average_year(lines) # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data, k, maxIterations = 100, initialModel = KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
    print("Prob: ", probabilities)
Example #53
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt


sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')
clusters = KMeansModel.load(sc, 'KMeasModel')
text = normData.map(lambda x : (x.no,x.eval_content))
data = normData.map(lambda x : (x.no,clusters.predict(x.idf_norm)) )
result = text.join(data).map(lambda (k, (left,right)) : (right,left.encode('utf-8')) )
for i in range(10):
	result.filter(lambda (x,y): x == i).map( lambda (x,y): y).saveAsTextFile("KMeansOutput/cluster_"+str(i))