def naivebayes_mllib():
    """Train an MLlib Naive Bayes sentiment classifier on S3-hosted movie
    reviews and print its accuracy on the held-out test split.

    Reads pos/neg training docs from s3n://usf-ml2/hwspark/train/ and test
    docs from s3n://usf-ml2/hw_spark/test/. Relies on module-level globals:
    `sc` (SparkContext), `parsedoc`, `removeStopWords`, and the MLlib
    imports `HashingTF`, `LabeledPoint`, `NaiveBayes`.

    Side effects: configures S3 credentials on the Hadoop configuration and
    prints the accuracy; returns None.
    """
    # SECURITY: hard-coded AWS credentials committed to source. These should
    # be loaded from the environment or an IAM role, and the existing keys
    # rotated. Left in place (redacted) to preserve the interface.
    AWS_ACCESS_KEY_ID = "XXXXXXXXXXXXXXXXXX"
    AWS_SECRET_ACCESS_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

    def _labeled_points(path, label):
        # One canonical pipeline for every split. The original pasted this
        # four times with the steps in four *different* orders (test-neg even
        # tokenized before lower-casing, so punctuation replacement ran
        # per-token) -- train and test features must be built identically.
        docs = sc.textFile(path)
        docs = docs.map(lambda x: parsedoc(x))
        docs = docs.map(lambda x: x.replace(',', ' ').replace('.', ' ')
                                   .replace('-', ' ').lower())
        words = docs.flatMap(lambda x: x.split())
        words = words.map(lambda x: removeStopWords(x))
        # NOTE(review): `words` holds single tokens, so HashingTF hashes one
        # word per vector and every LabeledPoint is per-token rather than
        # per-document. Preserved from the original -- confirm this is the
        # intended unit of classification.
        tf = HashingTF().transform(words.map(lambda x: x, preservesPartitioning=True))
        return tf.map(lambda x: LabeledPoint(label, x))

    tr_folder = "s3n://usf-ml2/hwspark/train/"
    training = _labeled_points(tr_folder + "neg/*.txt", 0.0).union(
        _labeled_points(tr_folder + "pos/*.txt", 1.0))
    model = NaiveBayes.train(training)

    te_folder = "s3n://usf-ml2/hw_spark/test/"
    testpn = _labeled_points(te_folder + "neg/*.txt", 0.0).union(
        _labeled_points(te_folder + "pos/*.txt", 1.0))

    predictionAndLabel = testpn.map(lambda p: (model.predict(p.features), p.label))
    # Python-3 compatible: the original used a tuple-unpacking lambda and the
    # `print` statement, both of which are syntax errors on Python 3.
    # Denominator: testpn.count() equals the original
    # test2.count() + test_p2.count(), since every token maps 1:1 to a point.
    correct = predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count()
    accuracy = correct * 1.0 / float(testpn.count())
    print("Accuracy is {}".format(round(accuracy, 5)))
def create_labelPoints(rawdata, label):
    """Build labeled term-frequency vectors from raw (key, text) documents.

    Each element of `rawdata` is a pair whose second item is the document
    text; the text is lower-cased, split on single spaces, hashed into a
    term-frequency vector via HashingTF, and wrapped in a LabeledPoint
    carrying `label`.

    Returns an RDD of LabeledPoint with the given class label.
    """
    terms = rawdata.map(lambda doc: doc[1].lower().split(" "),
                        preservesPartitioning=True)
    vectors = HashingTF().transform(terms)
    return vectors.map(lambda tf_vec: LabeledPoint(label, tf_vec))