#x for x in nameSet.toLocalIterator():
#	l.append(x2)
# Wrap `l` in a pandas DataFrame and write it out as CSV to a fixed path.
# NOTE(review): `l` is not defined in the visible part of the file; it is
# presumably the list of names collected above — confirm.
pandas.DataFrame(l).to_csv('/home/slic/name0503.csv')

# recover from training data

def recover(x):
	"""Rebuild a LabeledPoint from its text form ``(label,[f1,f2,...])``.

	Args:
		x: The serialized point. Either text or raw ``bytes`` (which is what
			``sc.binaryFiles`` delivers on Python 3).

	Returns:
		A ``LabeledPoint`` whose label is the first comma-separated field
		(as a float) and whose features are the remaining fields parsed as
		a list of floats.

	Raises:
		ValueError: If the first field cannot be parsed as a float.
	"""
	# On Python 3 the file contents arrive as bytes, and bytes.strip('()')
	# raises TypeError. On Python 2 ``bytes is str``, so this is a no-op.
	if isinstance(x, bytes) and not isinstance(x, str):
		x = x.decode('utf-8')

	# Remove surrounding whitespace and the outer parentheses, then split
	# into the label followed by the individual feature tokens.
	fields = x.strip().strip('()').split(',')
	label = float(fields[0])

	# Re-join the feature tokens and drop the enclosing square brackets so
	# the text can be parsed as one comma-separated run of numbers.
	# NOTE(review): if `x` holds more than one "(label,[...])" record, only
	# the leading numbers are parsed here — confirm the input is one record.
	feature_text = ','.join(fields[1:]).strip('[]')
	features = np.fromstring(feature_text, dtype=float, sep=',').tolist()

	return LabeledPoint(label, features)

# Read the files under the HDFS path as (key, value) pairs.
# NOTE(review): per the PySpark docs, binaryFiles hands each file over as a
# single record, so recover() runs once per file — confirm every file holds
# exactly one "(label,[...])" record; otherwise sc.textFile is probably what
# is wanted here.
images = sc.binaryFiles("hdfs:///user/slic/output501")
# Keep only the values and parse each one with recover().
data = images.values().map(recover)

# RDD to DataFrame
df = data.toDF()


 # ML
# Fit both indexers on the full DataFrame, before it is split.
labelIndexer = StringIndexer(
	inputCol="label", outputCol="indexedLabel"
).fit(df)
featureIndexer = VectorIndexer(
	inputCol="features", outputCol="indexedFeatures", maxCategories=5
).fit(df)

# Random split of the rows with weights 0.7 (training) and 0.3 (test).
trainingData, testData = df.randomSplit([0.7, 0.3])

# Two candidate classifiers, both reading the indexed label/feature columns.
dt = DecisionTreeClassifier(
	labelCol="indexedLabel", featuresCol="indexedFeatures"
)
rf = RandomForestClassifier(
	labelCol="indexedLabel", featuresCol="indexedFeatures"
)

# Chain the indexers and the decision tree; `rf` is not part of this pipeline.
pipeline = Pipeline(
	stages=[labelIndexer, featureIndexer, dt]
)



# Get the training set description from the labels CSV.
# NOTE(review): getBag/getFile/first are defined outside this view; the
# comments below describe their role only as far as the call sites show.
c = getBag('/home/xhan/trainLabels.csv')
# Generate the file name(s) to extract, based on `c` and the HDFS train directory
filename = getFile(c,"hdfs:///user/hduser/train/")
# Load the images into Spark as binary files
images = sc.binaryFiles(filename)

# Map each input record through `first`
data = images.map(first)

# Create a DataFrame from the data RDD with columns name, label, features
df = data.toDF(['name','label','features'])

 # ML
 # refer to official tutorial 
# Fit both indexers on the full DataFrame, before it is split.
labelIndexer = StringIndexer(
	inputCol="label", outputCol="indexedLabel"
).fit(df)
featureIndexer = VectorIndexer(
	inputCol="features", outputCol="indexedFeatures", maxCategories=5
).fit(df)

# Random split of the rows with weights 0.7 (training) and 0.3 (test).
trainingData, testData = df.randomSplit([0.7, 0.3])

# Random forest reading the indexed label/feature columns.
rf = RandomForestClassifier(
	labelCol="indexedLabel", featuresCol="indexedFeatures"
)
pipeline = Pipeline(
	stages=[labelIndexer, featureIndexer, rf]
)

# Fitting the pipeline on the training split also runs the indexer stages.
model = pipeline.fit(trainingData)

# Score the held-out split.
predictions = model.transform(testData)

# NOTE(review): the value stored in `accuracy` is the evaluator's "precision"
# metric. Newer Spark releases reportedly dropped "precision" in favour of
# "accuracy" — confirm against the Spark version in use.
evaluator = MulticlassClassificationEvaluator(
	labelCol="indexedLabel",
	predictionCol="prediction",
	metricName="precision"
)
accuracy = evaluator.evaluate(predictions)