# k-means update sketch:
#   else point[c] = c1
#   c1 = avg(X where point[c] == c1)
#   c2 = avg(X where point[c] == c2)
# Launch with: pyspark --packages com.databricks:spark-csv_2.10:1.3.0
import numpy as np
import random

from pyspark.sql import SQLContext

# `sc` (SparkContext) is provided by the pyspark shell — this script assumes
# it is already in scope.
sqlContext = SQLContext(sc)

# First argument is the data file name.
# path = sys.argv[1]
dataFilePath = "train.csv"

# Load the training CSV via the spark-csv package and expose it to Spark SQL.
trainDataFrame = sqlContext.csvFile(source="com.databricks.spark.csv",
                                    header="true", path=dataFilePath)
trainDataFrame.registerTempTable("trainDataFrame")


def isCentroid(point, centroidArray):
    """Return the 1-based label "C<k>" of the centroid whose coordinates
    match `point` to six decimal places, or "Not Centroid" if none match.

    Each entry of `centroidArray` is indexed as centroid[1][0] / centroid[1][1]
    for its x/y coordinates, i.e. it presumably looks like
    (key, (x, y)) — TODO confirm against the caller.
    """
    # enumerate replaces the original manual `i = 1; i = i + 1` counter.
    for i, centroid in enumerate(centroidArray, start=1):
        # Compare via fixed-precision string formatting (6 decimal places)
        # to sidestep exact float equality; `and` (short-circuit) replaces
        # the original bitwise `&` on the two parenthesized booleans —
        # identical result, idiomatic form.
        if ("%.6f" % point[0] == "%.6f" % centroid[1][0]
                and "%.6f" % point[1] == "%.6f" % centroid[1][1]):
            return "C" + str(i)
    return "Not Centroid"


def getCentroid(point, centroidArray):
    # NOTE(review): this definition is truncated in the visible chunk; the
    # prefix below is preserved unchanged (reformatted only).
    res = isCentroid(point, centroidArray)
    if res == "Not Centroid":
        myCentroid = centroidArray[0]