def perform_training(sc: SparkContext, params_dict: dict):
    normal_ekg_data_path = params_dict.get('normal_ekg_data_path')
    min_num_of_clusters = int(params_dict.get('min_num_of_clusters', 5))
    max_num_of_clusters = int(params_dict.get('max_num_of_clusters', 20))
    # boundary_ratio is a ratio in (0, 1], so parse it as a float, not an int
    boundary_ratio = float(params_dict.get('boundary_ratio', 0.8))

    ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
        lambda line: np.array([float(val) for val in line.split(',')]))
    # ekg_rdd_data.foreach(Plotter.plot_signal_window)

    k_range = range(min_num_of_clusters, max_num_of_clusters)
    prev_cost = float(np.inf)
    final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
    cost_ratios = []
    found_best = False
    for k in k_range:
        km = KMeans.train(ekg_rdd_data, k)
        # cost is the sum of squared distances of samples to the nearest cluster center
        cost = km.computeCost(ekg_rdd_data)
        ratio = cost / prev_cost
        prev_cost = cost
        cost_ratios.append(ratio)
        # keep the first model whose cost improvement flattens out (elbow heuristic)
        if ratio > boundary_ratio and not found_best:
            final_km = km
            found_best = True
    Plotter.plot_elbow(cost_ratios, k_range)
    return final_km
def kmeans():
    """
    Run K-means (via MLlib) on the sample dataset shipped with the Spark
    installation. Because KMeans.train expects
    "Training points as an `RDD` of `Vector` or convertible",
    the dataset must be parsed first:
        raw dataset    --> ['0.0 0.0 0.0', '0.1 0.1 0.1', '0.2 0.2 0.2']
        parsed dataset --> [array([0., 0., 0.]), array([0.1, 0.1, 0.1]), array([0.2, 0.2, 0.2])]
    :return: the trained model, reloaded from disk
    """
    data_rdd = sc.textFile('{}/mllib/kmeans_data.txt'.format(current_dir))
    parsed_data_rdd = data_rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the clustering model
    clusters = KMeans.train(parsed_data_rdd, 2, maxIterations=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = parsed_data_rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save the trained model, then load it back
    model_path = "{}/kmeans_model".format(current_dir)
    if not os.path.exists(model_path):
        clusters.save(sc, model_path)
    trained_model = KMeansModel.load(sc, model_path)
    return trained_model
def logo_feature_cluster(train_feature_list, train_name_list, clusternum):
    # Training
    model = KMeans.train(sc.parallelize(train_feature_list), clusternum,
                         maxIterations=10, initializationMode="random",
                         seed=50, initializationSteps=5, epsilon=1e-4)
    model_path = tempfile.mkdtemp()
    model.save(sc, model_path)
    model = KMeansModel.load(sc, model_path)
    # Prediction
    predict = model.predict(sc.parallelize(train_feature_list))
    # print(predict.collect())
    try:
        rmtree(model_path)
    except OSError:
        pass
    logo_result_path = os.path.join(
        result_path, "logo_image_result" + str(clusternum) + ".txt")
    writeResultTofile(logo_result_path, train_name_list, predict.collect())
    # Calinski-Harabasz clustering evaluation metric
    # evaluationCH = metrics.calinski_harabaz_score(train_feature_list, predict.collect())
    # ch = str(round(evaluationCH, 2))
    # print("Calinski-Harabasz score: " + ch)
    # with open(result_path + "Calinski-Harabasz.txt", 'a') as a:
    #     a.write(str(clusternum) + ":" + ch + "\n")
    # Silhouette-Coefficient clustering evaluation metric
def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def kmeans_w2v_predict():
    # appName = 'kmeans_w2v_predict'
    # sc = SparkContext(appName=appName)
    # from pyspark.sql import SQLContext
    from pyspark.mllib.clustering import KMeans, KMeansModel
    # sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet("hdfs:///user/rmusters/lambert_w2v_data_jan")
    # df = data.toDF("text", "filtered_text", "split_text", "vectors", "id")
    df = data.toDF("tokens", "vectors", "id")
    df = df.where(df.vectors.isNotNull())
    data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")
    # data = data.map(lambda (text, filtered_text, split_text, vectors, id): (text, filtered_text, split_text, vectors, model.predict(vectors), id))
    # df = data.toDF(["text", "filtered_text", "split_text", "vectors", "cluster", "id"])
    # Python 2 tuple-parameter lambda: predict a cluster for every row
    data = data.map(lambda (tokens, vectors, id): (tokens, vectors, model.predict(vectors), id))
    df = data.toDF(["tokens", "vectors", "cluster", "id"])
    df = df.select("cluster", "id")
    df = df.sort(df.cluster.asc())
    df.write.format("com.databricks.spark.csv").mode("overwrite").save("lambert_w2v_data_cluster.csv")
    # df.save("hdfs:///user/rmusters/lambert_w2v_data_cluster.csv", "com.databricks.spark.csv")
    df.write.parquet("hdfs:///user/rmusters/lambert_w2v_data_cluster", mode="overwrite")
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
def getCluster(price, crime, male, female, white, black, asian, hispanic,
               young, mid_age, senior):
    KModel = KMeansModel.load(sc, "project/data/output/KMeansModel")
    cluster = KModel.predict([price, crime, male, female, white, black, asian,
                              hispanic, young, mid_age, senior])
    return cluster
def KMeans_Processing(self, columns):
    data_point = np.array(self.df_PD[columns])
    model = KMeansModel.load(self.sc, self.baseDir + '/fraudModel/Model/' + 'KMeans')
    result = np.array(model.predict(self.sc.parallelize(data_point)).collect())
    self.df_PD.insert(len(list(self.df_PD.columns)), 'KMeans_feature', result)
def assign_pooling(data):
    image_name, feature_matrix = data[0]
    clusterCenters = data[1]
    feature_matrix = np.array(feature_matrix)
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    # max-pool the distance to the assigned center per visual word
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value  # clusterCenters is a Spark broadcast variable
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    # print(image_name + " in group: " + str(group))
    return [(image_name, group)]
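A minimal usage sketch for the assign_pooling variant above; the names sc, rows_rdd, and centers are placeholders (not from the original snippet). The centers are broadcast once so the function's clusterCenters.value lookup works on every executor.

# Hypothetical driver-side usage of assign_pooling (assumes an existing
# SparkContext `sc`, an RDD `rows_rdd` of dicts with 'fileName'/'features',
# and `centers`, a list of numpy arrays from a trained k-means model)
centers_bc = sc.broadcast(centers)
image_groups = rows_rdd.map(lambda row: assign_pooling(row, centers_bc, pooling="max"))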
def main():
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")
    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    # buckets = tile_avgs.collect()
    # print("Bucket", buckets)
    tileMap = tile_avgs.map(lambda l: [item for sublist in l for item in sublist])
    tileList = tileMap.collect()
    print("Tile List", tileList)
    print("Tile List type", type(tileList))
    ############################################################################
    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    # Pick a small image for each tile via the k-means match
    for tile in tileList:
        if tile[0] == currentRow:
            noOfCol = noOfCol + 1
        else:
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
        smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                  tileSize, clusterIndex)
        readyToCombine.append(smallImg)
    # Paste the small images onto the big image canvas
    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    # print("Before Print, Check Once again", readyToCombine)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow, tileSize)
    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()
    # Load the model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)
    centers = sameModel.clusterCenters
    print("Cluster Centers: ")
    # Write each center to its own CSV file
    for n, center in enumerate(centers):
        out_f = OUTPUT_DATA + str(n) + "Cluster.csv"
        numpy.savetxt(out_f, center, newline=";")
        print(center)
def main():
    modelname = sys.argv[1]
    tiffname = sys.argv[2]
    outputname = sys.argv[3]
    sc = SparkContext()
    model = KMeansModel.load(sc, modelname)
    dataset = gdal.Open(tiffname, GA_ReadOnly)
    x, y, data = train.tiff_to_array(dataset, train.weights)
    driver = dataset.GetDriver().ShortName
    clusterdata = sc.parallelize(data)
    result = np.array(clusterdata.map(lambda point: model.predict(point)).collect())
    write_to_tif(outputname, x, y, result, driver)
def assign_pooling(data):
    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def kmeans_classification(sc, c_tag, util):
    print 'data retrieving'
    _data_ = data_retriever(c_tag)
    print len(_data_)
    # rids = [x[0] for x in _data_]
    blc = nlpb.nlpblockbase()
    __ans = [train_feature_extraction(x, c_tag, blc, util) for x in _data_]
    ans = [[float(k) for k in x] for x in __ans]
    # print ans
    train_data = [np.array(sf.softmax(x)) for x in ans]
    # train_data = ans
    print 'dataprep done'
    assert_len(train_data)
    brotrain = sc.broadcast(train_data)
    clusters = KMeans.train(sc.parallelize(brotrain.value), 200,
                            maxIterations=10, initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    # WSSSE is computed locally here rather than on the RDD
    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    print len(_data_)
    cnt = 0
    for row in _data_:
        clustered[clusters.predict(train_data[cnt])].append(
            [row[0], row[2], row[3], row[4]])
        cnt += 1
    print len(clustered)
    for k in clustered.keys():
        print len(clustered[k])

    clusters.save(sc, "MovieModel")
    sameModel = KMeansModel.load(sc, "MovieModel")
    ref = collections.defaultdict(list)
    for point in train_data:
        ref[sameModel.predict(point)].append(point)
    # sanity check: the reloaded model reproduces the original assignments
    for x, y in zip(ref.keys(), clustered.keys()):
        assert len(ref[x]) == len(clustered[y])
    return clustered
def kmeansInitialClusters(dataset):
    model = KMeansModel(CENTER_VECTORS)
    # coerce the stored features into mllib Vectors via a stringify/parse round trip
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result = []
    for d in dataset.collect():
        entry = {}
        entry["features"] = d["features"]
        entry["prediction"] = trainedModel.predict(Vectors.parse(Vectors.stringify(d['features'])))
        entry["label"] = d['label']
        result.append(entry)
    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity",
                                "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
def main(sc):
    data = [[1.0, 1.0], [1.0, 0.8], [-1.0, 1.0], [-1.0, -1.0]]
    parsedData = sc.parallelize(data)
    kmeansModel = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                               initializationMode="random")
    print(kmeansModel.predict([1.0, 1.0]))
    print(kmeansModel.predict([1.0, -2.0]))
    # Save and load model
    kmeansModel.save(sc, "KMeansModel")
    model = KMeansModel.load(sc, "KMeansModel")
    print(model.predict([1.0, 1.0]))
    print(model.predict([1.0, -2.0]))
def print_model(self, model_name):
    # try to load the specified model
    path = self.base + model_name
    try:
        model = KMeansModel.load(self.sc, path)
    except Exception:
        raise Exception('No such model found on hdfs!')
    # print the raw centers, then the centers rounded to two decimals
    for c in model.clusterCenters:
        print(c)
    for c in model.clusterCenters:
        l = []
        for i in c:
            i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
            l.append(float(i))
        print(l)
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()
    # Load data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(lambda line: array([float(x) for x in line.split(';')]))
    # Load model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)
    # Predict a cluster label per row
    labels = sameModel.predict(dataset).collect()
    # Save labels to a JSON file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)
def predict():
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.linalg import Vectors
    data = sqlContext.read.format("com.databricks.spark.csv").option(
        "header", "true").load("w2v_vector.csv")
    data = data.map(lambda x: [float(a) for a in x])
    df = data.toDF()
    columns = df.columns
    vectors = df.select(columns[1:71])
    vectors = vectors.map(lambda x: Vectors.dense(x))
    for n_clusters in _range:
        model = KMeansModel.load(sc, "hdfs:///user/rmusters/w2v_model_kmeans_" + str(n_clusters))
        predicted = model.predict(vectors)
        result = predicted.map(lambda x: (x,)).toDF()
        result.save("clusters_" + str(n_clusters))
def kmeans_lda_predict():
    appName = 'kmeans_lda_predict'
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
    sc = SparkContext(appName=appName)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet("hdfs:///user/rmusters/lda_doc_topic")  # lda_data_jan
    # data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/kmeans_lda_jan")
    # Python 2 tuple-parameter lambda: attach the predicted cluster to each row
    data = data.map(lambda (id, vectors): (id, vectors, model.predict(vectors)))
    df = data.toDF(["id", "vectors", "cluster"])
    df = df.sort(df.cluster.asc())
    df.write.parquet("hdfs:///user/rmusters/lda_data_cluster", mode="overwrite")
    logger.info(appName)
def predict(self, model_name, data):
    '''
    Predict the cluster for an unseen data point.
    :param model_name: the name of a trained model saved on HDFS
    :param data: the unseen data point
    :return: (cluster_index, cluster_center)
    '''
    # try to load the specified model
    path = self.base + model_name
    try:
        model = KMeansModel.load(self.sc, path)
    except Exception:
        raise Exception('No such model found on hdfs!')
    # predict: which cluster the point belongs to
    index = model.predict(data)
    print('Data:%s belongs to cluster:%s. The index is %s'
          % (data, model.clusterCenters[index], index))
    return index, model.clusterCenters[index]
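A quick usage sketch for the predict method above; the instance name clusterer, the model name 'my_kmeans', and the sample point are hypothetical.

# Hypothetical call: `clusterer` is an instance of the class defining
# predict(); 'my_kmeans' must exist under clusterer.base on HDFS.
index, center = clusterer.predict('my_kmeans', [0.5, 1.2, -0.3])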
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute a rotation for each split of the data, using the given coarse quantizers.
    """
    Rs = []
    mus = []
    counts = []
    for split in xrange(2):
        print 'Starting rotation fitting for split %d' % split
        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])
        # Build a k-means model from the coarse quantizer's centers
        model = KMeansModel(Cs[split])
        R, mu, count = compute_local_rotations(sc, data, model, M / 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)
    return Rs, mus, counts
def parseOwnership(line):
    fields = line.split(',')
    # print len(fields)
    owner = fields[0]
    taxes = int(fields[1])
    lat = float(fields[2])
    lon = float(fields[3])
    cluster = int(fields[4])
    return (owner, taxes, lat, lon, cluster)

conf = SparkConf().setMaster("local").setAppName("tugasbigdata")
sc = SparkContext(conf=conf)
clusters = KMeansModel.load(sc, "C:/SparkCourse/FP_model")
lines = sc.textFile("file:///SparkCourse/data_center.csv")
parsedLines = lines.map(parseLine)
# clean up rows with missing fields
reserveddata = parsedLines.filter(lambda x: x is not None)
reserveddata1 = reserveddata.filter(lambda x: x[1] is not None)
reserveddata2 = reserveddata1.filter(lambda x: x[2] is not None)
temp = reserveddata2.filter(lambda x: "San Francisco" in x[0])
tempdata = temp.map(lambda x: (x[1], x[2]))
data = tempdata.map(lambda x: (float(x[0]), float(x[1])))
data_local = data.collect()
ownershipData = lines.map(parseOwnership)  # was parseOwner, which is undefined
# more cleanup
def main():
    data_rdd = load_data(project["data_file"])
    print(data_rdd.count())
    listed_data_rdd = data_rdd.map(data_extractor)

    # Filter unwanted data rows
    elect_filtered_rdd = listed_data_rdd.filter(electric_vehicles_filter)
    filtered_rdd = elect_filtered_rdd.filter(empty_cost_filter)

    # Map related data to a format convenient for clustering
    cost_tx_rdd = filtered_rdd.map(cost_transform_mapper)
    feature_mapped_rdd = cost_tx_rdd.map(feature_mapper)

    estimated_clusters = KMeansModel.load(spark_context, "identified_clusters")
    optimum_cluster = 4
    optimum_points_rdd = feature_mapped_rdd.filter(
        lambda filtered_data_feature_vector:
        estimated_clusters.predict(filtered_data_feature_vector[-1]) == optimum_cluster
    )
    print(optimum_points_rdd.count())

    sample_data = optimum_points_rdd.take(100)
    for data in sample_data:
        feature_vector = data[-1]
        pyplot.scatter(feature_vector[0], feature_vector[1])

    optimum_cluster_manufactures_rad = optimum_points_rdd.map(manufactures_mapper)
    optimum_points_rdd.persist()  # hold the previously calculated data set in memory
    individual_manufactures_count_rdd = optimum_cluster_manufactures_rad.reduceByKey(operator.add)
    sorted_manufactures_count_rdd = individual_manufactures_count_rdd.sortBy(
        lambda manufactures_set: manufactures_set[1], ascending=False
    )

    top_ten = 10
    vehicle_count = []
    manufactures_name = []
    for manufacture in sorted_manufactures_count_rdd.take(top_ten):
        vehicle_count.append(manufacture[1])
        manufactures_name.append(manufacture[0])
        print(manufacture)

    pyplot.title("Best Vehicle Cluster")
    pyplot.xlabel("Europe Rating")
    pyplot.ylabel("Feature Normalized")
    pyplot.show()

    number_of_manufactures = top_ten  # sorted_manufactures_count_rdd.count()
    index = np.arange(number_of_manufactures)
    bar_width = 0.5
    opacity = 0.4
    error_config = {"ecolor": "0.3"}
    chart = pyplot.bar(index, vehicle_count, bar_width, alpha=opacity,
                       color="b", error_kw=error_config, label="manufactures")
    pyplot.xticks(index + bar_width, manufactures_name)
    pyplot.title("Top performing vehicles")
    pyplot.xlabel("manufactures")
    pyplot.ylabel("Vehicles count")
    pyplot.show()
# print np.shape(parsedData)

# Build the model (cluster the data) and evaluate the clustering by computing
# the Within Set Sum of Squared Errors for k = 2..6
def error(point):
    center = clusters.centers[clusters.predict(point)]
    # print center
    return sqrt(sum([x**2 for x in (point - center)]))

# WSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
WSSE = np.zeros(7)  # indexed by k (2..6); np.zeros(5) would overflow at k=5
import time
for i in range(2, 7):
    t = time.time()
    clusters = KMeans.train(parsedData, i, maxIterations=100, runs=100,
                            initializationMode="random")
    WSSE[i] = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print str(WSSE[i]) + " " + str(i) + " WITH TIME = " + str(time.time() - t)

# WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
# print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "./mymodel1")
sameModel = KMeansModel.load(sc, "./mymodel1")
# print clusters
from numpy import array
from math import sqrt

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
import sys
from collections import OrderedDict
from numpy import array
from math import sqrt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeansModel

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
            "predict.py kddcup.data.file"
        sys.exit(1)

    data_file = sys.argv[1]
    # a trailing "\" followed by a comment is a syntax error, so keep the option inline
    conf = SparkConf().setAppName("KDDCup99")  # .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    model = KMeansModel.load(sc, "best_model")
    clusters = model.clusterCenters

    with open(data_file) as file:
        for line in file:
            line_split = line.split(",")
            clean_line_split = [line_split[0]] + line_split[4:]
            clusterIndex = model.predict(array([float(x) for x in clean_line_split]))
            print clusterIndex

    print
    print "DONE!"
from numpy import array
from math import sqrt
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=20, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
def process(id, content):
    kmeansDoc = DocumentKmeans(id, '')
    print(type(content))
    kmeansDoc.content = content
    kmeansDoc.doc2vec()
    kmeansDoc.printVec()
    kmeansDoc.kmeansVec()
    kmeansDoc.cluster_id = kmeansPredict.predict(kmeansDoc)
    return kmeansDoc

utils = Utils(45000)
sc = SparkContext(appName="PythonKafkaConsumerKmeans")
kmeansModel = KMeansModel.load(sc, '../KmeansModel')
kmeansPredict = KmeansPredict(kmeansModel)

consumer = KafkaConsumer('test', group_id='kafka_consumer_group',
                         bootstrap_servers=['localhost:9092'])
for message in consumer:
    value = message.value
    spl = value.split(':')
    id = spl[0]
    content = utils.normalizeString(spl[1])
    docu = process(id, content)
    print(docu.cluster_id)

# extra consumer configuration examples (not wired to the loop above)
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)
KafkaConsumer(value_deserializer=lambda m: json.loads(unicode(m, "utf8")))
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Saving and loading the model (the bare expressions below are REPL inspections)
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters
def kmeans_model_load(self, sc, path):
    return KMeansModel.load(sc, path)
def __init__(self, spark_context, model_path):
    self.model = KMeansModel.load(spark_context, model_path)
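A minimal sketch of the thin-wrapper pattern the constructor above belongs to; the class name ClusterPredictor, the model path, and the sample point are illustrative only, not from the source.

# Hypothetical wrapper class built around the constructor above
from pyspark.mllib.clustering import KMeansModel

class ClusterPredictor(object):
    def __init__(self, spark_context, model_path):
        self.model = KMeansModel.load(spark_context, model_path)

    def predict(self, point):
        # delegate straight to the loaded MLlib model
        return self.model.predict(point)

# predictor = ClusterPredictor(sc, "hdfs:///models/kmeans")
# print(predictor.predict([1.0, 2.0]))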
input_living_index = sys.argv[1]

# Read the parquet data and convert to an RDD
parquet_living_index = sqlContext.read.parquet(input_living_index)
parquet_living_index.registerTempTable("living_index_table")
living_index_table = sqlContext.sql("SELECT * FROM living_index_table")
living_index_rdd = living_index_table.map(lambda colName: (
    str(colName.Community_Code) + "," + str(colName.Crime_Frequency) + "," +
    str(colName.Housing_Crowded) + "," + str(colName.Household_BPL) + "," +
    str(colName.Unemployed) + "," + str(colName.Without_Diploma) + "," +
    str(colName.Age_Bar) + "," + str(colName.Per_Capita_Income) + "," +
    str(colName.Hardship_Index)))

# K-means does multiple runs to find optimal cluster centers, so cache its input
cluster_input = living_index_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5, runs=5,
                        initializationMode="random")

# Sum each point's distance to its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model, then load it back
clusters.save(sc, "myModel/living-index")
sameModel = KMeansModel.load(sc, "myModel/living-index")
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
from pyspark import SparkContext

sc = SparkContext(appName="AA_kmeans")

# Load and parse the data
data = sc.textFile("hdfs://namenode/kmeans_data.txt")
# print(data.take(5))
# print data.map(lambda line: array([x for x in line.split(' ')])).collect()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://namenode/myModelPath")
sameModel = KMeansModel.load(sc, "hdfs://namenode/myModelPath")
from numpy import array
from math import sqrt
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark_cassandra import streaming

def predict(line):
    spl = line.split(':')
    doc = DocumentKmeans(spl[0], spl[1], spl[2])
    doc.kmeansVec()
    doc.cluster_id = kmeansPredict.predict(doc)
    return doc

sc = SparkContext()
sameModel = KMeansModel.load(sc, "../KmeansModel")
kmeansPredict = KmeansPredict(sameModel)

sc.textFile('hdfs://localhost:8020/user/manh/vector') \
    .filter(lambda x: len(x) > 2000) \
    .map(lambda x: predict(x)) \
    .map(lambda x: {
        'id': x.id,
        'cluster_id': int(x.cluster_id),
        'timestamps': long(x.timestamps),
        'vector': x.vector
    }).saveToCassandra('reishi', 'dockmeans')
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("/home/grijesh/sampleData/k-means-data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("k_means_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    # $example off$

    sc.stop()
# Tail of the streaming handler `detect` (its def line was cut off in the
# source); `arrays` holds the batch's parsed feature vectors and `count`
# their number.
    print arrays.collect()
    indx = 0
    while indx < count:
        vec = Vectors.dense(arrays.collect()[indx])
        indx += 1
        clusternum = model.predict(vec)
        print "Cluster -> ", clusternum, vec
    return

# Create a local StreamingContext with two working threads and a batch interval of 10 seconds
conf = SparkConf().setAppName("Fraud Detector")
conf = conf.setMaster("local[2]")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 8999)

model = KMeansModel.load(sc, "kmeansmodel01")
print model.clusterCenters
print "************************** Loaded the model *********************"

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))
lines.foreachRDD(detect)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
from pyspark import SparkContext

# Load and parse the data
sc = SparkContext()
data = sc.textFile("/user/hduser/venkat/iris.txt")
print data.first()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")
prediction = clusters.predict(parsedData)
print clusters.centers

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
myModelPath = "/user/hduser/output/kmeans_output"
clusters.save(sc, myModelPath)
sameModel = KMeansModel.load(sc, myModelPath)
# check input db
input_db = os.path.join(args.input_root_dir, "dbs", "db", "out.parquet")
if not os.path.isdir(input_db):
    raise Exception("missing db parquet directory")

# check output dir
# logger.debug("Create new codebook dir...")
output_dir = os.path.join(args.input_root_dir, 'features', 'feature')
if os.path.isdir(output_dir):
    new_name = output_dir + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
    logger.info("backup old output-dir in %s" % new_name)
    os.rename(output_dir, os.path.join(args.input_root_dir, new_name))
os.makedirs(output_dir)

model = KMeansModel.load(sc, input_codebook)
model = sc.broadcast(model)  # broadcast the model to the executors
pooling = "max"
feature_name = "SIFT"  # note: the map below passes "SURF" instead of this variable

df = sqc.read.parquet(input_db)
print df.count()

features_bow = df.map(functools.partial(compute_global_feature,
                                        feature_name="SURF",
                                        model=model,
                                        pooling=pooling))
print features_bow.first()
# Load and parse the data
# conf = SparkConf()
sc = SparkContext()
data = sc.textFile("./business_gps.csv")
parsedData = data.map(parse_line)

# Build the model (cluster the data);
# "k-means" is not a valid initializationMode, so use "k-means||"
clusters = KMeans.train(parsedData, 10, maxIterations=1000, runs=10,
                        initializationMode="k-means||")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "kmeans_model")
sameModel = KMeansModel.load(sc, "kmeans_model")
print("Cluster centers", sameModel.clusterCenters)
conf = SparkConf().setAppName('KMeans').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data
data = sc.textFile('../data/kmeans_data.txt')
parseData = data.map(lambda line: np.array([float(x) for x in line.split(' ')]))

# build the model
clusters = KMeans.train(parseData, 2, maxIterations=10, runs=10,
                        initializationMode='random')

# evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parseData.map(lambda p: error(p)).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error: ' + str(WSSSE))

# save and load model
clusters.save(sc, '../model/KMeansModel')
sameModel = KMeansModel.load(sc, '../model/KMeansModel')
sc.stop()
"hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv" ) average_per_year = average_year(lines) # 2014 and 2015 average_per_month = average_month(average_per_year) data = parseDataset(dataset) k = int(sys.argv[1]) initial_centroids = generate_initial_centroids(average_per_month.collect(), k) # KMeans start = time() kmeans_model = KMeans.train(data, k, maxIterations=100, initialModel=KMeansModel(initial_centroids)) end = time() elapsed_time = end - start kmeans_output = [ "====================== KMeans ====================\n", "Final centers: " + str(kmeans_model.clusterCenters), "Total Cost: " + str(kmeans_model.computeCost(data)), "Value of K: " + str(k), "Elapsed time: %0.10f seconds." % elapsed_time ] # Predicting points = parseDataset(predict_data) count_lines = float(len(points.collect())) probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
if ascontext.isComputeDataModelOnly():
    ascontext.setSparkOutputSchema(output_schema)
    sys.exit(0)
else:
    modelpath = ascontext.getModelContentToPath("model")
    model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)
predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel, predictors, df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model we need an RDD of DenseVector (the numeric encoded
# values of the predictors); use DataModelTools to build it
dv = dmt.extractDenseVector(df, predictors, setToFlag=1.0)

def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result

mapFn = lambda (x, y): rowToList(x) + [y]  # Python 2 tuple-parameter lambda
rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])
def main():
    k_input_model = sys.argv[1]   # read kmeans model from this location
    w_input_model = sys.argv[2]   # read word2vec model from this location
    input_file = sys.argv[3]      # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmeans and Word2Vec models'''
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a set of all words in our model (set membership is O(1))'''
    keys = sqlContext.read.parquet(w_input_model + "/data")
    keys_list = set(keys.rdd.map(lambda line: line.word).collect())

    '''sbaronia - here we create one vector per review, where the vector counts
    how many times each cluster is assigned to a word in the review, in a
    SparseVector-compatible format'''
    features = []
    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] = histogram[clust] + 1
        features.append((2000, range(2000), histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                           .map(lambda line: nor.transform(SparseVector.parse(line))) \
                           .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                               .select('rating', 'feature') \
                               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                               .coalesce(1) \
                               .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                              .select('rating', 'feature') \
                              .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                              .coalesce(1) \
                              .cache()

    '''sbaronia - find the best step size using validation, then run
    LinearRegressionWithSGD with that step and report the final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

def parse_line(ln):
    split_ln = ln.split(',')
    ln_coord = [float(split_ln[1]), float(split_ln[2])]
    # append the label of the predicted cluster to the input line
    new_line = ln + ',' + cluster_labels[KMeans_model.predict(ln_coord)]
    return new_line

# load and parse the data
# conf = SparkConf()
sc = SparkContext()

# load previously generated k-means model
KMeans_model = KMeansModel.load(sc, "kmeans_model")

# define cluster label array
cluster_labels = ["Pheonix-AZ", "Edinburgh-UK", "Charlotte-NC", "Madison-WI",
                  "Montreal-Canada", "Waterloo-Canada", "Las Vegas-NV",
                  "Urbana-Champaign-IL", "Pittsburgh-PA", "Karlsruhe-Germany"]

# read the file which has business_ids, latitude, longitude
data = sc.textFile("./business_gps.csv")

# get labelled rows
parsedData = data.map(parse_line)

# save labelled businesses in the output folder
parsedData.saveAsTextFile("./output")
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansApp")  # SparkContext

    # Load and parse the data
    data = sc.textFile("s3://irm238FinalProject/input/citibike*")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model (the save and load paths must match;
    # the original saved "KmeansModel" but loaded "KMeansModel")
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")

    sc.stop()
import sys
import json
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel

def mapper(line):
    # Format the line
    line = line.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
    elements = line.split(",")
    stock_name = elements.pop(0)
    percent_changes = map(lambda x: float(x), elements)
    return stock_name, percent_changes

if __name__ == "__main__":
    sc = SparkContext(appName="ComputeResults")
    model = KMeansModel.load(sc, sys.argv[2])
    mapred_results = sc.textFile(sys.argv[1])

    # Group stock names by their predicted cluster
    clusters = mapred_results.map(mapper)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda stock: (model.predict(array(stock[1])), [stock[0]]))\
        .reduceByKey(lambda a, b: a + b)\
        .collectAsMap()

    with open('result.json', 'w') as fp:
        json.dump(clusters, fp)
sqlContext = SQLContext(sc)

# Read the input parquet
input_crime = sys.argv[1]

# Read the parquet data and convert to an RDD
parquet_crime = sqlContext.read.parquet(input_crime)
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(lambda line: str(line.Year) + "," +
                            str(line.Latitude) + "," +
                            str(line.Longitude) + "," +
                            str(line.Crime_Frequency))

# K-means does multiple runs to find optimal cluster centers, so cache its input
cluster_input = crime_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5, runs=5,
                        initializationMode="random")

# Sum each point's distance to its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model, then load it back
clusters.save(sc, "myModel_crime")
sameModel = KMeansModel.load(sc, "myModel_crime")
if __name__ == "__main__":
    b = open("name", 'wb')
    sc = SparkContext("local[*]", "kmeans")

    print("data being loaded.....")
    # file:///dev/desc_hdfs
    data = sc.textFile(sys.argv[1]).map(
        lambda row: map(lambda x: float(x), row.split(',')))
    print("data loaded!")

    D = 128
    print("loading and counting")
    data_size = data.count()
    print("count done")

    print("model being loaded.....")
    model = KMeansModel.load(sc, sys.argv[2])
    print("model loaded!")
    centers = model.clusterCenters

    # ################ SAMPLING #################################################
    # total_sampled_points = int(sys.argv[3])
    cluster = {}
    samples = {}
    print("data being stored in array....")
    # da = data.collect()
    print("data stored")
    n_clusters = model.k
    for j in range(n_clusters):
        cluster[j] = []
currTime = strftime("%Y-%m-%d-%H-%M-%S")
sc = SparkContext(appName="KMeans")
lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

average_per_year = average_year(lines)  # 2014 and 2015
average_per_month = average_month(average_per_year)
data = parseDataset(dataset)
k = int(sys.argv[1])
initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

# KMeans
start = time()
kmeans_model = KMeans.train(data, k, maxIterations=100,
                            initialModel=KMeansModel(initial_centroids))
end = time()
elapsed_time = end - start
kmeans_output = [
    "====================== KMeans ====================\n",
    "Final centers: " + str(kmeans_model.clusterCenters),
    "Total Cost: " + str(kmeans_model.computeCost(data)),
    "Value of K: " + str(k),
    "Elapsed time: %0.10f seconds." % elapsed_time
]

# Predicting
points = parseDataset(predict_data)
count_lines = float(len(points.collect()))
probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
print("Prob: ", probabilities)
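The snippet above seeds training by wrapping raw centroids in a KMeansModel and passing it as initialModel. A minimal self-contained sketch of that pattern (the toy points and centers are illustrative, not from the source):

# Minimal sketch of seeding KMeans.train with explicit initial centers;
# the toy data below is illustrative only.
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

sc = SparkContext(appName="SeededKMeansSketch")
points = sc.parallelize([np.array([0.0, 0.0]), np.array([0.1, 0.1]),
                         np.array([9.0, 9.0]), np.array([9.1, 9.1])])
# wrap hand-picked centers in a KMeansModel to use them as the starting point
seed_model = KMeansModel([np.array([0.0, 0.0]), np.array([9.0, 9.0])])
model = KMeans.train(points, 2, maxIterations=10, initialModel=seed_model)
print(model.clusterCenters)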
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')
# KMeansModel.load needs the SparkContext as its first argument
clusters = KMeansModel.load(sc, 'KMeasModel')

text = normData.map(lambda x: (x.no, x.eval_content))
data = normData.map(lambda x: (x.no, clusters.predict(x.idf_norm)))
result = text.join(data).map(lambda (k, (left, right)): (right, left.encode('utf-8')))

for i in range(10):
    result.filter(lambda (x, y): x == i).map(
        lambda (x, y): y).saveAsTextFile("KMeansOutput/cluster_" + str(i))