def main():
    """Driver program for a spam filter using Spark and MLlib.

    Reads the spam and ham text files, hashes the space-separated words of
    each line into a 10,000-feature vector, trains a logistic regression
    model on a random 70% split, prints the error rate measured on the
    remaining 30%, and pickles the trained model to ``spamFilter.pkl``.
    """
    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile(
            "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20050311_spam_2.tar\\20050311_spam_2\\spam.txt"
        )
        ham = sc.textFile(
            "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20030228_easy_ham.tar\\20030228_easy_ham\\ham.txt"
        )

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(
            lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(
            lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed instead of unpacked in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())
    finally:
        # End the Spark Context even when loading/training fails
        sc.stop()

    # Print out the error rate
    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence; the context manager closes the file
    with open("spamFilter.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
Beispiel #2
0
    def test_classification(self):
        """Check LR, SVM and Naive Bayes predictions on four sparse samples.

        Each classifier is trained on the same RDD and must predict a value
        <= 0 for the samples labelled 0.0 and a value > 0 for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

        # (label, index of the single non-zero entry, value of that entry)
        specs = ((0.0, 0, 1.0), (1.0, 1, 1.0), (0.0, 0, 2.0), (1.0, 1, 2.0))
        data = [
            LabeledPoint(label, self.scipy_matrix(2, {index: value}))
            for label, index, value in specs
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]
        expect_positive = (False, True, False, True)

        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            trained = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = trained.predict(vector)
                if positive:
                    self.assertTrue(prediction > 0)
                else:
                    self.assertTrue(prediction <= 0)
Beispiel #3
0
    def test_classification(self):
        """Check LR, SVM and Naive Bayes predictions on four dense samples.

        Each classifier is trained on the same RDD and must predict a value
        <= 0 for the samples labelled 0.0 and a value > 0 for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

        rows = ((0.0, [1, 0, 0]), (1.0, [0, 1, 1]), (0.0, [2, 0, 0]), (1.0, [0, 2, 1]))
        data = [LabeledPoint(label, values) for label, values in rows]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]
        expect_positive = (False, True, False, True)

        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            trained = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = trained.predict(vector)
                if positive:
                    self.assertTrue(prediction > 0)
                else:
                    self.assertTrue(prediction <= 0)
Beispiel #4
0
def trainevaluatemodel_logit(model, traindata, validationdata, iterations, step, minibatchfraction, regparam):
    """Train an L2 logistic regression with SGD, score it, and report timing.

    The incoming ``model`` argument is not read; it is rebound to the newly
    trained model. The score comes from ``evaluation2(model, validationdata)``.

    Returns:
        Tuple ``(iterations, step, minibatchfraction, regparam, duration, index)``
        where ``duration`` is the elapsed time of training plus evaluation.
    """
    started_at = time()
    model = LogisticRegressionWithSGD.train(
        traindata,
        iterations=iterations,
        step=step,
        miniBatchFraction=minibatchfraction,
        initialWeights=None,
        regParam=regparam,
        regType='l2',
        intercept=False,
        validateData=True,
        convergenceTol=0.001,
    )
    index = evaluation2(model, validationdata)
    duration = time() - started_at

    # One "name:value" entry per line, headed by 'Param:'
    report_lines = [
        'Param:',
        'iterations:' + str(iterations),
        'step:' + str(step),
        'minibatchfraction:' + str(minibatchfraction),
        'regparam:' + str(regparam),
        'time:' + str(duration),
        'index:' + str(index),
    ]
    print('\n'.join(report_lines))
    return (iterations, step, minibatchfraction, regparam, duration, index)
Beispiel #5
0
def main():
    """Train and evaluate a logistic-regression image classifier with Spark.

    Calls ``MakePixelFileFromImages`` on the training image glob, loads
    ``pos.csv`` / ``neg.csv`` (one comma-separated sample per line), trains
    on a random 70% split, prints the error rate on the remaining 30%, and
    pickles the model to ``imageModel.pk1``.
    """
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")
    try:
        p = sc.textFile("pos.csv")
        n = sc.textFile("neg.csv")

        # Each line becomes the list of its comma-separated fields
        pFeatures = p.map(lambda image: image.split(","))
        nFeatures = n.map(lambda image: image.split(","))

        # Label positives 1 and negatives 0
        pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
        nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))

        data = pExamples.union(nExamples)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        trainingData.cache()

        model = LogisticRegressionWithSGD.train(trainingData)
        labels_and_predictions = testData.map(
            lambda image: (image.label, model.predict(image.features)))
        # Index the pair rather than unpack it in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())

        print("************* RESULTS *******************")
        print("Error Rate: " + str(error_rate))

        # The context manager guarantees the pickle file is flushed and closed
        with open("imageModel.pk1", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even if any step above raised
        sc.stop()
Beispiel #6
0
    def test_classification(self):
        """Check LR, SVM and Naive Bayes predictions on four sparse samples.

        Each classifier is trained on the same RDD and must predict a value
        <= 0 for the samples labelled 0.0 and a value > 0 for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

        # (label, index of the single non-zero entry, value of that entry)
        specs = ((0.0, 0, 1.0), (1.0, 1, 1.0), (0.0, 0, 2.0), (1.0, 1, 2.0))
        data = [
            LabeledPoint(label, self.scipy_matrix(2, {index: value}))
            for label, index, value in specs
        ]
        rdd = self.sc.parallelize(data)
        features = [point.features for point in data]
        expect_positive = (False, True, False, True)

        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            trained = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = trained.predict(vector)
                if positive:
                    self.assertTrue(prediction > 0)
                else:
                    self.assertTrue(prediction <= 0)
def modelWithLogisticRegression(trainingData, validationData):
    """Train SGD logistic regression for several regularization values.

    One model is trained per regularization parameter (200 iterations,
    step size 1.0) and scored by its accuracy on ``validationData``.

    Returns:
        Tuple ``(bestLRModel, visualizationData)``: the model with the
        highest validation accuracy (the first one on ties) and a list of
        ``(regularizer, accuracy)`` pairs in training order.

    Raises:
        ZeroDivisionError: if ``validationData`` is empty.
    """
    #eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    bestLRModel = None
    bestAccuracy = 0
    numOfIterations = 200
    visualizationData = []

    # The validation set size does not depend on the model, so count it once
    # instead of launching one extra Spark job per candidate.
    totalValidationAds = validationData.count()

    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        visualizationData.append((regularizer, accuracy))

        # Accept the first model unconditionally so a model is still returned
        # when every candidate scores 0 accuracy.
        if bestLRModel is None or accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model

    return bestLRModel, visualizationData
Beispiel #8
0
def trainAndTestLG(train_lbl_vec, test_lbl_vec, regParam, lastTime):
    """Train an L1-regularized logistic regression and score train/test sets.

    Both input RDDs hold 2-element records; element 0 is passed as the
    ``LabeledPoint`` label and element 1 as its features.

    Args:
        train_lbl_vec: RDD of training records.
        test_lbl_vec: RDD of test records.
        regParam: regularization parameter forwarded to the trainer.
        lastTime: reference timestamp subtracted from ``time()`` at the end.

    Returns:
        ``[elapsedTime, trainAccuracy, testAccuracy]`` where the accuracies
        are whatever ``accuracy()`` returns for the
        ``(label, prediction) -> count`` maps.
    """
    # create LabeledPoints for training
    # (records are indexed, not unpacked in the lambda signature, because
    # tuple parameters are a SyntaxError on Python 3)
    lblPnt = train_lbl_vec.map(lambda rec: LabeledPoint(rec[0], rec[1]))

    # train the model
    model = LogisticRegressionWithSGD.train(lblPnt,
                                            miniBatchFraction=0.1,
                                            regType='l1',
                                            intercept=True,
                                            regParam=regParam)

    # evaluate training
    resultsTrain = lblPnt.map(lambda lp:
                              (lp.label, model.predict(lp.features)))
    resultMap = resultsTrain.countByValue()
    trainAccuracy = accuracy(resultMap)

    # test the model
    data = test_lbl_vec.map(lambda rec: LabeledPoint(rec[0], rec[1]))

    resultsTest = data.map(lambda lp: (lp.label, model.predict(lp.features)))
    resultMapTest = resultsTest.countByValue()
    testAccuracy = accuracy(resultMapTest)

    thisTime = time()
    elapsedTime = thisTime - lastTime
    return [elapsedTime, trainAccuracy, testAccuracy]
Beispiel #9
0
def modelWithLogisticRegression(trainingData, validationData):
    """Train SGD logistic regression for several regularization values.

    One model is trained per regularization parameter (200 iterations,
    step size 1.0) and scored by its accuracy on ``validationData``.

    Returns:
        Tuple ``(bestLRModel, visualizationData)``: the model with the
        highest validation accuracy (the first one on ties) and a list of
        ``(regularizer, accuracy)`` pairs in training order.

    Raises:
        ZeroDivisionError: if ``validationData`` is empty.
    """
    #eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    bestLRModel = None
    bestAccuracy = 0
    numOfIterations = 200
    visualizationData = []

    # The validation set size does not depend on the model, so count it once
    # instead of launching one extra Spark job per candidate.
    totalValidationAds = validationData.count()

    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(trainingData,
                                                numOfIterations,
                                                1.0,
                                                regParam=regularizer)
        predict = validationData.map(lambda ad:
                                     (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds

        visualizationData.append((regularizer, accuracy))

        # Accept the first model unconditionally so a model is still returned
        # when every candidate scores 0 accuracy.
        if bestLRModel is None or accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model

    return bestLRModel, visualizationData
Beispiel #10
0
    def test_classification(self):
        """Check LR, SVM and Naive Bayes predictions on four dense samples.

        Each classifier is trained on the same RDD and must predict a value
        <= 0 for the samples labelled 0.0 and a value > 0 for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes

        rows = ((0.0, [1, 0, 0]), (1.0, [0, 1, 1]), (0.0, [2, 0, 0]), (1.0, [0, 2, 1]))
        data = [LabeledPoint(label, values) for label, values in rows]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]
        expect_positive = (False, True, False, True)

        for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
            trained = trainer.train(rdd)
            for vector, positive in zip(features, expect_positive):
                prediction = trained.predict(vector)
                if positive:
                    self.assertTrue(prediction > 0)
                else:
                    self.assertTrue(prediction <= 0)
Beispiel #11
0
def train_committee(train_features, test_features, size=5):
    """Build a committee of logistic-regression models trained on resamples.

    Up to ``size * 4`` attempts are made. In each attempt a model is trained
    on a with-replacement sample (fraction 1) of the labelled training pairs,
    and it joins the committee only if its area under ROC on the test pairs
    exceeds 0.7. Stops early once ``size`` models have been accepted.

    Returns:
        List of accepted models (possibly shorter than ``size``).
    """
    roc_threshold = 0.7
    max_attempts = size * 4
    committee = []
    attempts = 0

    def as_pairs(batch):
        return process_batch(batch, is_train=True)

    test_labeled_pairs = test_features.map(as_pairs).map(to_labeled_point)

    while not (len(committee) >= size or attempts >= max_attempts):
        attempts += 1

        # sample() is called inside the loop so each attempt draws a new resample
        labeled_points = (train_features
                          .map(as_pairs)
                          .map(to_labeled_point)
                          .sample(True, 1))

        model = LogisticRegressionWithSGD.train(labeled_points)
        # Without a threshold predict() returns raw scores, as the ROC metric needs
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(
            lambda p: (model.predict(p.features), p.label))

        area = BinaryClassificationMetrics(scores_and_labels).areaUnderROC
        if area > roc_threshold:
            print(attempts, area)
            committee.append(model)

    return committee
def processData(sc):
    """Train a logistic regression on DATA_FILE and pickle test predictions.

    Loads ``DATA_FILE`` as text, maps each line through ``mapper``, trains
    ``LogisticRegressionWithSGD`` with default settings, prints the training
    error, then predicts every element of ``parsed_test_data`` and pickles
    the collected predictions.

    Args:
        sc: SparkContext used to read the input file.
    """
    # load and parse the data
    raw_data = sc.textFile(DATA_FILE)
    raw_data.persist()

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Train data size {}".format(raw_data.count()))
    # map data to a format needed for logistic regression
    parsedData = raw_data.map(mapper)

    # Two spaces reproduce the separator the old `print a, b` statement emitted
    print("Sample of input to algorithm  {}".format(parsedData.take(10)))

    # Train model
    t0 = time()
    model = LogisticRegressionWithSGD.train(parsedData)
    t1 = time() - t0
    print("Classifier trained in {} seconds".format(round(t1, 3)))

    labelsAndPreds = parsedData.map(
        lambda point: (point.label, model.predict(point.features)))

    # Evaluating the model on training data: the error is the fraction of
    # MISclassified points (the previous `v == p` filter reported accuracy
    # under the "Training Error" label).
    trainErr = labelsAndPreds.filter(
        lambda pair: pair[0] != pair[1]).count() / float(parsedData.count())

    print("Training Error = " + str(trainErr))

    print("*************************** TESTING NOW ***********************")

    # NOTE(review): parsed_test_data is not defined in this function; it is
    # presumably a module-level RDD of feature vectors - confirm.
    preds = parsed_test_data.map(lambda point: model.predict(point))

    with open('/home/ashish/Desktop/preds.pickle', 'wb') as f:
        pickle.dump(preds.collect(), f)
def trainEvaluateModel(trainData, validationData, numIterationsParm,
                       stepSizeParm, miniBatchFractionParm):
    '''
    Train one LogisticRegressionWithSGD model and evaluate it.

    The model is trained on trainData with the given iteration count, step
    size and mini-batch fraction, then scored with
    evaluateModel(model, validationData). The elapsed time covers both
    training and evaluation, and a summary line is printed.

    :param trainData: training data passed to LogisticRegressionWithSGD.train
    :param validationData: data passed to evaluateModel
    :param numIterationsParm: number of SGD iterations
    :param stepSizeParm: SGD step size
    :param miniBatchFractionParm: mini-batch fraction
    :return: tuple (AUC, duration, numIterationsParm, stepSizeParm,
             miniBatchFractionParm, model)
    '''
    print('======================= 训练评估模型 =======================')
    began = time()
    trained = LogisticRegressionWithSGD.train(
        trainData, numIterationsParm, stepSizeParm, miniBatchFractionParm)
    auc = evaluateModel(trained, validationData)
    elapsed = time() - began

    # Assemble the summary from the same fragments, in the same order
    summary = ''.join([
        '========== [trainEvaluateModel] >>>> 训练评估模型:使用参数:numIterations=',
        str(numIterationsParm),
        ', stepSize=',
        str(stepSizeParm),
        ', miniBatchFraction=',
        str(miniBatchFractionParm),
        '\n',
        '\t\t==>> 所需时间=',
        str(elapsed),
        ', 结果AUC=',
        str(auc),
    ])
    print(summary)
    return (auc, elapsed, numIterationsParm, stepSizeParm,
            miniBatchFractionParm, trained)
Beispiel #14
0
def trainEvaluateModel(trainData, validationData, numIterations, stepSize, miniBatchFraction):
    """Train one SGD logistic regression model, evaluate it, and time both.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)`` where ``AUC`` is the result of
        ``evaluateModel(model, validationData)``.
    """
    began = time()
    trained = LogisticRegressionWithSGD.train(
        trainData, numIterations, stepSize, miniBatchFraction)
    auc = evaluateModel(trained, validationData)
    elapsed = time() - began

    print("训练评估:numIterations->", numIterations,
          ", stepSize->", stepSize,
          ", miniBatchFraction->", miniBatchFraction)
    print("==> 所需时间:", elapsed, "s ,AUC=", auc)

    return (auc, elapsed, numIterations, stepSize, miniBatchFraction, trained)
def trian_model(spam, nospam):
    """Train a logistic regression on spam (label 1) and non-spam (label 0).

    Both inputs are turned into feature vectors with the module-level ``tf``
    transformer, wrapped into LabeledPoints, and merged into one training set.

    Returns:
        The model trained by ``LogisticRegressionWithSGD.train``.
    """
    def labelled(documents, label):
        # Vectorize the documents and attach the class label to each vector
        return tf.transform(documents).map(lambda f: LabeledPoint(label, f))

    positives = labelled(spam, 1)
    negatives = labelled(nospam, 0)
    return LogisticRegressionWithSGD.train(positives.union(negatives))
Beispiel #16
0
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Train L2-regularized logistic regression and return its test accuracy.

    Args:
        x_train: RDD of labelled points used for training (it gets cached).
        x_test: RDD of labelled points used for scoring.
        regParam: regularization parameter forwarded to the trainer.

    Returns:
        Fraction of ``x_test`` points whose prediction equals their label.

    Raises:
        ZeroDivisionError: if ``x_test`` is empty.
    """
    # cache data to get reasonable speeds for methods like LogisticRegression and SVM
    xc = x_train.cache()
    # training logistic regression with L2 regularization
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")
    # making prediction on x_test
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # returning accuracy on x_test; the pair is indexed because tuple
    # parameters in a lambda signature are a SyntaxError on Python 3
    return yhat.filter(lambda pair: pair[0] == pair[1]).count() / float(x_test.count())
def main(sc):
    """Time SGD logistic-regression training on a sample of the input file.

    Args:
        sc: SparkContext used to read ``input/ctc_data.txt``; each line is
            parsed with ``parsePoint``.
    """
    train_data = sc.textFile("input/ctc_data.txt").map(parsePoint)
    # randomSplit returns a LIST of RDDs, which cannot be passed to train().
    # NOTE(review): the first (weight 0.2) split is assumed to be the
    # intended training sample - confirm against the author's intent.
    parsedTrainData, _ = train_data.randomSplit(weights=[0.2, 0.8])
    start = time.time()
    LogisticRegressionWithSGD.train(parsedTrainData)
    end = time.time()
    time_elapsed = end - start
    output = "\nusing SGD " + str(time_elapsed)
    # Single-argument print(...) behaves the same on Python 2 and 3
    print(output)
def trainEvaluateModel(trainData, validationData, numInterations, stepSize, minibatchFaction):
    """Train one SGD logistic regression model and evaluate it.

    The elapsed time of training plus evaluation is printed but is not part
    of the return value.

    Returns:
        Tuple ``(AUC, numInterations, stepSize, minibatchFaction, model)``
        where ``AUC`` is the result of ``evaluateModel(model, validationData)``.
    """
    startTime = time()

    model = LogisticRegressionWithSGD.train(trainData, numInterations, stepSize, minibatchFaction)
    AUC = evaluateModel(model, validationData)
    durintation = time() - startTime
    # Single-argument print(...) behaves the same on Python 2 and 3
    print('durintation' + str(durintation))
    return (AUC, numInterations, stepSize, minibatchFaction, model)
Beispiel #19
0
def lr(trainingData, testData, trainingSize, testSize):
    '''
    Grid-search an SGD logistic regression and print train/test error rates.

    Every combination of iteration count, regularization parameter, step
    size and regularization type is trained and scored on the TRAINING data;
    the combination with the lowest training error is then scored on both
    the training and the test data. Results are printed; nothing is returned.

    trainingSize and testSize are the denominators of the error rates and
    are expected to be the element counts of the respective RDDs.
    '''
    numIterValList = [100, 200]
    regParamValList = [0.01, 0.1, 1, 10, 100]
    stepSizeValList = [0.1, 0.5, 1]
    regTypeValList = ['l2', 'l1']

    def error_rate(model, data, size):
        # Fraction of points whose prediction differs from the label. Pairs
        # are indexed because tuple lambda parameters break on Python 3.
        labelsAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))
        return labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(size)

    def show(*values):
        # Space-separated str() of each value, as `print a, b, c` produced
        print(" ".join(str(v) for v in values))

    # variables for the best parameters
    bestNumIterVal = 200
    bestRegParamVal = 0.01
    bestStepSizeVal = 1
    bestRegTypeVal = 'l2'
    bestTrainErr = 100
    bestModel = None

    for numIterVal, regParamVal, stepSizeVal, regTypeVal in itertools.product(
            numIterValList, regParamValList, stepSizeValList, regTypeValList):
        model = LogisticRegressionWithSGD.train(
            trainingData, iterations=numIterVal, regParam=regParamVal,
            step=stepSizeVal, regType=regTypeVal)
        trainErr = error_rate(model, trainingData, trainingSize)
        if trainErr < bestTrainErr:
            bestNumIterVal = numIterVal
            bestRegParamVal = regParamVal
            bestStepSizeVal = stepSizeVal
            bestRegTypeVal = regTypeVal
            bestTrainErr = trainErr
            # Keep the winner so it need not be trained a second time
            bestModel = model
        show(numIterVal, regParamVal, stepSizeVal, regTypeVal, trainErr)
    show(bestNumIterVal, bestRegParamVal, bestStepSizeVal, bestRegTypeVal, bestTrainErr)

    model = bestModel
    if model is None:
        # No candidate beat the initial error bound: fall back to the
        # default parameters, as the original unconditional retrain did.
        model = LogisticRegressionWithSGD.train(
            trainingData, iterations=bestNumIterVal, regParam=bestRegParamVal,
            step=bestStepSizeVal, regType=bestRegTypeVal)

    # Evaluating the model on training data
    show(error_rate(model, trainingData, trainingSize))

    # Evaluating the model on test data
    show(error_rate(model, testData, testSize))
Beispiel #20
0
def LR_train(data):
    """Train a 10-iteration SGD logistic regression and report test accuracy.

    ``data`` is passed through ``split_data``; for each resulting record,
    element 1 becomes the label and the last element the features. The
    points are split 80/20 (seed 0) into training and test sets.

    Returns:
        Tuple ``(model_LR, accuracy)``.

    Raises:
        ZeroDivisionError: if the test split is empty.
    """
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_LR = LogisticRegressionWithSGD.train(training, 10)
    predictionAndlabel = test.map(
        lambda x: (float(model_LR.predict(x.features)), x.label))
    # Index the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3.
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda pair: pair[0] == pair[1]).count() / test.count()
    print("accuracy of model_LR:%f" % accuracy)
    return model_LR, accuracy
Beispiel #21
0
def main():
    """Driver program for a spam filter using Spark and MLlib.

    Consolidates the individual email files into ``data/spam.txt`` and
    ``data/ham.txt``, hashes the space-separated words of each line into a
    10,000-feature vector, trains a logistic regression model on a random
    70% split, prints the error rate on the remaining 30%, and pickles the
    trained model to ``spamFilter.pkl``.
    """
    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile("data/spam.txt")
        ham = sc.textFile("data/ham.txt")

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed instead of unpacked in the lambda signature:
        # tuple parameters are a SyntaxError on Python 3.
        error_rate = labels_and_predictions.filter(
            lambda pair: pair[0] != pair[1]).count() / float(testData.count())
        print("*********** SPAM FILTER RESULTS **********")
        print("\n")
        print("Error Rate: " + str(error_rate))
        print("\n")

        # Serialize the model for persistence; the context manager closes the file
        with open("spamFilter.pkl", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Release the Spark context even if any step above raised
        sc.stop()
Beispiel #22
0
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Train L2-regularized logistic regression and return its test accuracy.

    Args:
        x_train: RDD of labelled points used for training (it gets cached).
        x_test: RDD of labelled points used for scoring.
        regParam: regularization parameter forwarded to the trainer.

    Returns:
        Fraction of ``x_test`` points whose prediction equals their label.

    Raises:
        ZeroDivisionError: if ``x_test`` is empty.
    """
    # cache data to get reasonable speeds for methods like LogisticRegression and SVM
    xc = x_train.cache()
    # training logistic regression with L2 regularization
    model = LogisticRegressionWithSGD.train(xc,
                                            regParam=regParam,
                                            regType="l2")
    # making prediction on x_test
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # returning accuracy on x_test; the pair is indexed because tuple
    # parameters in a lambda signature are a SyntaxError on Python 3
    return yhat.filter(lambda pair: pair[0] == pair[1]).count() / float(x_test.count())
Beispiel #23
0
def task2():
    """Classify titles as programming / non-programming and print them.

    Titles are taken from the lines of the module-level ``data`` that
    contain ``title: ``. A logistic regression is trained on hashed word
    features of the ``fileProgramming`` (label 1) and ``fileOther``
    (label 0) text files. All non-programming titles are printed first,
    then all programming titles, followed by the elapsed time in seconds.

    Relies on the module-level names ``data``, ``sc``, ``fileProgramming``
    and ``fileOther``.
    """
    # Print title with Machine Learning Classification
    print("-------------------------------------------")
    startTitle = time.time()
    regex1 = re.compile(".*(title:).*")
    find1 = [m.group(0) for l in data for m in [regex1.search(l)] if m]
    # The regex accepts "title:" without a following space; such lines have
    # nothing after the 'title: ' separator and used to raise IndexError.
    title = [i.split('title: ', 1)[1] for i in find1 if 'title: ' in i]

    Programming = sc.textFile(fileProgramming)
    Other = sc.textFile(fileOther)

    # Create a HashingTF instance to map title text to vectors of 100,000 features.
    tf = HashingTF(numFeatures=100000)

    # Each title is split into words, and each word is mapped to one feature.
    programmingFeatures = Programming.map(
        lambda text: tf.transform(text.split(" ")))
    otherFeatures = Other.map(lambda text: tf.transform(text.split(" ")))

    # Create LabeledPoint datasets for positive (programming) and negative (other) examples.
    positiveExamples = programmingFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = otherFeatures.map(
        lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache()

    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Group output lines by the predicted class itself. Searching the
    # finished line for the label text misfiles (or prints twice) any title
    # that happens to contain "Programmings" or "Non-Programming".
    programmingLines = []
    otherLines = []
    for row in title:
        test = tf.transform(row.split(" "))
        if model.predict(test) == 1:
            programmingLines.append(row + " = " + "Programmings")
        else:
            otherLines.append(row + " = " + "Non-Programming")

    for line in otherLines:
        print(line)

    for line in programmingLines:
        print(line)

    endTitle = time.time()
    elapsedTitle = endTitle - startTitle
    print(elapsedTitle)
    print("-------------------------------------------")
Beispiel #24
0
    def test_classification(self):
        """Check six MLlib classifiers on four dense samples.

        LR, SVM, Naive Bayes, decision tree, random forest and gradient
        boosted trees are trained in that order on the same RDD; each must
        predict a value <= 0 for the samples labelled 0.0 and a value > 0
        for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        rows = ((0.0, [1, 0, 0]), (1.0, [0, 1, 1]), (0.0, [2, 0, 0]), (1.0, [0, 2, 1]))
        data = [LabeledPoint(label, values) for label, values in rows]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]
        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

        def check(trained):
            # Samples 0 and 2 are negative, samples 1 and 3 positive
            self.assertTrue(trained.predict(features[0]) <= 0)
            self.assertTrue(trained.predict(features[1]) > 0)
            self.assertTrue(trained.predict(features[2]) <= 0)
            self.assertTrue(trained.predict(features[3]) > 0)

        # Thunks keep training lazy: each model is trained right before its checks
        trainers = [
            lambda: LogisticRegressionWithSGD.train(rdd),
            lambda: SVMWithSGD.train(rdd),
            lambda: NaiveBayes.train(rdd),
            lambda: DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo),
            lambda: RandomForest.trainClassifier(
                rdd,
                numClasses=2,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numTrees=100),
            lambda: GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
        ]
        for train in trainers:
            check(train())
Beispiel #25
0
 def update(rdd):
     """Retrain the enclosing object's model on ``rdd``, starting from its current weights."""
     # LogisticRegressionWithSGD.train raises an error for an empty RDD.
     if rdd.isEmpty():
         return
     current_weights = self._model.weights
     self._model = LogisticRegressionWithSGD.train(
         rdd,
         self.numIterations,
         self.stepSize,
         self.miniBatchFraction,
         current_weights,
         regParam=self.regParam,
         convergenceTol=self.convergenceTol)
def main(sc):
    """Train a tiny logistic regression, then save it and load it back.

    Predictions for the two sample vectors are printed once from the trained
    model and once from the model loaded from the ``lrsgd`` path.
    """
    probes = ([1.0, 0.0], [0.0, 1.0])
    training = sc.parallelize([
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(1.0, [1.0, 0.0]),
    ])
    lrm = LogisticRegressionWithSGD.train(training, iterations=10)
    for vector in probes:
        print(lrm.predict(vector))

    # Save and load model
    lrm.save(sc, "lrsgd")
    sameModel = LogisticRegressionModel.load(sc, "lrsgd")
    for vector in probes:
        print(sameModel.predict(vector))
Beispiel #27
0
def ml_lost():
    """Fit SVM and logistic regression on user attributes to model churn.

    Builds one LabeledPoint per user in ``role_detail_dict`` (skipping
    ``roleid == 0``) from the normalized attribute list, trains
    ``SVMWithSGD`` and ``LogisticRegressionWithSGD`` for 10 iterations each,
    and prints each model, its per-attribute weights, and the lost rate.

    Relies on the module-level names ``role_detail_dict``,
    ``create_role_dict``, ``get_attr_list`` and ``normalize_detail``.
    """
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.classification import SVMWithSGD
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark import SparkContext

    lost_array = []
    attr_list = get_attr_list()
    # Single-argument print(...) behaves the same on Python 2 and 3
    print("get attr list=%s" % str(attr_list))
    lost_sum = 0
    for user in role_detail_dict.values():
        if user.roleid == 0:
            continue
        # list of this user's attribute values
        ratio_array = normalize_detail(user, attr_list)
        unlost_value = 1
        if user.is_lost:
            # MLlib binary classifiers expect labels 0/1; the previous -1
            # label fails their input validation.
            unlost_value = 0
            lost_sum += 1
        lost_array.append(LabeledPoint(unlost_value, ratio_array))

    sc = SparkContext(appName="lost_statis")
    try:
        sc.setLogLevel('ERROR')
        parall = sc.parallelize(lost_array)

        svm = SVMWithSGD.train(parall, iterations=10)
        print(svm)
        svm_weight = list(getattr(svm, "_coeff"))
        print("======svm weight len==%d" % len(svm_weight))
        # Indexing (not zip) keeps the IndexError when there are fewer
        # weights than attributes instead of silently truncating.
        svm_weight_dict = {
            attr: svm_weight[seq] for seq, attr in enumerate(attr_list)
        }
        print(svm_weight_dict)

        lrm = LogisticRegressionWithSGD.train(parall, iterations=10)
        print(lrm)
        lrm_weight = list(getattr(lrm, "_coeff"))
        print("======lrm weight len==%d" % len(lrm_weight))
        lrm_weight_dict = {
            attr: lrm_weight[seq] for seq, attr in enumerate(attr_list)
        }
        print(lrm_weight_dict)

        # NOTE(review): the "- 1" presumably discounts a single roleid == 0
        # entry - confirm against how role_detail_dict is populated.
        all_detail_user = len(role_detail_dict) - 1
        # Guard the ratio so an empty user table does not raise
        if all_detail_user > 0:
            lost_rate = lost_sum / float(all_detail_user)
        else:
            lost_rate = 0.0
        # `__dict__` is not callable; the %d placeholder needs the entry count
        print("lost_rate=%f:all user num=%d:create_role_dict=%d" % (
            lost_rate, all_detail_user, len(create_role_dict)))
    finally:
        # Release the Spark context even if training or reporting fails
        sc.stop()
Beispiel #28
0
    def test_classification(self):
        """Check six MLlib classifiers on four dense samples.

        LR, SVM, Naive Bayes, decision tree, random forest and gradient
        boosted trees are trained in that order on the same RDD; each must
        predict a value <= 0 for the samples labelled 0.0 and a value > 0
        for those labelled 1.0.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        rows = ((0.0, [1, 0, 0]), (1.0, [0, 1, 1]), (0.0, [2, 0, 0]), (1.0, [0, 2, 1]))
        data = [LabeledPoint(label, values) for label, values in rows]
        rdd = self.sc.parallelize(data)
        features = [point.features.tolist() for point in data]
        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

        def check(trained):
            # Samples 0 and 2 are negative, samples 1 and 3 positive
            self.assertTrue(trained.predict(features[0]) <= 0)
            self.assertTrue(trained.predict(features[1]) > 0)
            self.assertTrue(trained.predict(features[2]) <= 0)
            self.assertTrue(trained.predict(features[3]) > 0)

        # Thunks keep training lazy: each model is trained right before its checks
        trainers = [
            lambda: LogisticRegressionWithSGD.train(rdd),
            lambda: SVMWithSGD.train(rdd),
            lambda: NaiveBayes.train(rdd),
            lambda: DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo),
            lambda: RandomForest.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100),
            lambda: GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
        ]
        for train in trainers:
            check(train())
Beispiel #29
0
def TrainLRModel(trainData, iterations, step,
                 miniBatchFraction):  # Logistic Regression
    """Standardize the features of ``trainData`` and fit a logistic regression.

    The feature vectors are scaled with a ``StandardScaler`` (``withMean=True``,
    ``withStd=True``) fitted on ``trainData`` itself, re-paired with their
    labels and handed to ``LogisticRegressionWithSGD.train``.

    Args:
        trainData: RDD whose records expose ``.features`` and ``.label``.
        iterations: forwarded to ``train`` as ``iterations``.
        step: forwarded to ``train`` as ``step``.
        miniBatchFraction: forwarded to ``train`` as ``miniBatchFraction``.

    Returns:
        The model returned by ``LogisticRegressionWithSGD.train``. The fitted
        scaler is not returned, so callers that want to score new data have to
        scale it themselves.
    """
    srcFeatures = trainData.map(lambda line: line.features)
    # Debug output: first raw feature vector. Single-argument print(...) is
    # valid on both Python 2 and Python 3.
    print(srcFeatures.first())
    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    # Debug output: first scaled feature vector.
    print(scaledFeature.first())
    # Pair every label with its scaled feature vector again.
    scaledData = srcLabel.zip(scaledFeature)
    # Index the (label, features) pair instead of using a tuple-parameter
    # lambda, which Python 3 rejects (PEP 3113).
    trainData = scaledData.map(
        lambda pair: LabeledPoint(pair[0], pair[1]))
    model = LogisticRegressionWithSGD.train(data=trainData,
                                            iterations=iterations,
                                            step=step,
                                            miniBatchFraction=miniBatchFraction)
    return model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize,
                       miniBatchFraction):
    """Train a logistic regression model, score it and report the timing.

    Args:
        trainData: training set passed to ``LogisticRegressionWithSGD.train``.
        validationData: data passed to ``evaluateModel`` together with the model.
        numIterations: second positional argument of ``train``.
        stepSize: third positional argument of ``train``.
        miniBatchFraction: fourth positional argument of ``train``.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)`` where ``AUC`` is whatever ``evaluateModel`` returns and
        ``duration`` is the elapsed ``time()`` difference covering both
        training and evaluation.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(trainData, numIterations, stepSize,
                                            miniBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - startTime
    # print(...) with one concatenated string emits the same text on Python 2
    # and Python 3; the former print statement was Python 2 only.
    print("訓練評估:使用參數" +
          " numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " miniBatchFraction=" + str(miniBatchFraction) +
          " 所需時間=" + str(duration) +
          " 結果AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
Beispiel #31
0
def TrainEvaluateModel(trainData, validationData, numIterations, stepSize,
                       miniBatchFraction):
    """Train a logistic regression model, evaluate it and log the outcome.

    Args:
        trainData: training set passed to ``LogisticRegressionWithSGD.train``.
        validationData: data passed to ``EvaluateModel`` together with the model.
        numIterations: second positional argument of ``train``.
        stepSize: third positional argument of ``train``.
        miniBatchFraction: fourth positional argument of ``train``.

    Returns:
        Tuple ``(AUC, duration, numIterations, stepSize, miniBatchFraction,
        model)``; ``AUC`` is the value returned by ``EvaluateModel`` and
        ``duration`` the elapsed ``time()`` difference for training plus
        evaluation.
    """
    startTime = time()
    model = LogisticRegressionWithSGD.train(trainData, numIterations, stepSize,
                                            miniBatchFraction)
    AUC = EvaluateModel(model, validationData)
    duration = time() - startTime
    # " stepSize=" previously lacked the "=", which glued the value to the
    # label (e.g. "stepSize0.1") unlike the neighbouring fields.
    print("Evaluate the model: use the params: " +
          "numIterations=" + str(numIterations) +
          " stepSize=" + str(stepSize) +
          " miniBatchFraction=" + str(miniBatchFraction) + "\n" +
          "====> duration time = " + str(duration) +
          " result AUC = " + str(AUC))
    return (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
def trainEvaluationModel(trainData, validationData, numIterations, stepSize, maxBatchFraction):
    """Fit a logistic regression model, score it and print a one-line report.

    Args:
        trainData: training set passed to ``LogisticRegressionWithSGD.train``.
        validationData: data handed to ``evaluateModel`` with the trained model.
        numIterations: number of iterations of stochastic gradient descent.
        stepSize: step size of the gradient descent.
        maxBatchFraction: fraction of the samples used per iteration
            (original note: a value between 0 and 1, default 1).

    Returns:
        ``(AUC, duration, numIterations, stepSize, maxBatchFraction, model)``
        with ``AUC`` as returned by ``evaluateModel`` and ``duration`` the
        elapsed ``time()`` difference for training plus evaluation.
    """
    began = time()
    model = LogisticRegressionWithSGD.train(
        trainData, numIterations, stepSize, maxBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - began
    # Assemble the report from the same fragments, in the same order.
    fragments = [
        "训练评估:使用参数 ",
        " numIterations = ", str(numIterations),
        " stepSize = ", str(stepSize),
        " maxBatchFraction = ", str(maxBatchFraction),
        " ==> 所需时间 = ", str(duration), " 秒",
        " 结果 AUC = ", str(AUC),
    ]
    print("".join(fragments))
    return AUC, duration, numIterations, stepSize, maxBatchFraction, model
def trainEvaluateModel(trainData, validationData, numIterations, stepSize,
                       miniBatchFraction):
    """Train with the given hyper-parameters, evaluate, and print a summary.

    Args:
        trainData: passed to ``LogisticRegressionWithSGD.train`` as ``data``.
        validationData: handed to ``evaluateModel`` with the trained model.
        numIterations: passed to ``train`` as ``iterations``.
        stepSize: passed to ``train`` as ``step``.
        miniBatchFraction: passed to ``train`` as ``miniBatchFraction``.

    Returns:
        ``(AUC, duration, numIterations, stepSize, miniBatchFraction, model)``;
        ``AUC`` is the value returned by ``evaluateModel`` and ``duration``
        the elapsed ``time()`` difference for training plus evaluation.
    """
    began = time()
    # Original author's note: this API is outdated; remember to switch to the
    # newer one.
    model = LogisticRegressionWithSGD.train(
        data=trainData, iterations=numIterations,
        step=stepSize, miniBatchFraction=miniBatchFraction)
    AUC = evaluateModel(model, validationData)
    duration = time() - began
    print("训练评估使用参数:\n",
          "numIterations=", numIterations,
          "\n stepSize=", stepSize,
          "\n miniBatchFraction=", miniBatchFraction,
          "====>用时=", duration,
          "\n 结果AUC=", AUC)
    outcome = (AUC, duration, numIterations, stepSize, miniBatchFraction, model)
    return outcome
Beispiel #34
0
def getLogisticRegressionModel(Train_Data):
    """Train ``LogisticRegressionWithSGD`` on ``Train_Data`` with fixed settings.

    Args:
        Train_Data: training set passed to ``train`` as ``data``.

    Returns:
        The model returned by ``LogisticRegressionWithSGD.train``.
    """
    # Fixed hyper-parameters, gathered in one place and expanded as keywords.
    hyperparams = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **hyperparams)
def main(input_file_path,
         output_file_path='/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv'):
    """Train on the labelled rows of a CSV file and write predictions for the rest.

    Rows whose 4th comma-separated field is non-empty (and whose first field
    is not ``'INDEX'``) are used for training; rows whose 4th field is empty
    are scored and written to ``output_file_path`` as ``INDEX,GENDER`` lines.

    Args:
        input_file_path: path given to ``sc.textFile``.
        output_file_path: destination CSV. Defaults to the path that was
            previously hard-coded, so existing callers behave as before.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    # Rows with first field 'INDEX' are excluded - presumably the header row.
    traning_data_RDD = data.filter(lambda line: line.split(',')[3] != '' and
                                   line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data,
                                                                iterations=100)

    # Accuracy is measured on the same data the model was trained on.
    labels_and_preds = parsed_data.map(
        lambda lp: [lp.label,
                    logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(lambda ele: int(ele[0]) == int(ele[1])
                                       ).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The context manager guarantees the file is flushed and closed; the
    # original handle was never closed. The loop variable no longer reuses
    # the name of the ``data`` RDD, and the builtin ``file`` is not shadowed.
    with open(output_file_path, 'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for record in unseen_parsed_data.collect():
            # The prediction is shifted by one before being written.
            result_file.write(
                str(record[0]) + ',' +
                str(logisticRegressionWithSGD.predict(record[1]) + 1) + '\n')

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
def getLogisticRegressionModel(Train_Data):
    """Return a ``LogisticRegressionWithSGD`` model trained on ``Train_Data``.

    All hyper-parameters are fixed inside the function: 10 iterations,
    step 10.0, mini-batch fraction 0.1, L2 regularisation with parameter 1e-6,
    and an intercept term.

    Args:
        Train_Data: training set passed to ``train`` as ``data``.

    Returns:
        The model returned by ``LogisticRegressionWithSGD.train``.
    """
    trainer = LogisticRegressionWithSGD.train
    return trainer(
        data=Train_Data,
        iterations=10,
        miniBatchFraction=0.1,
        step=10.,
        regParam=1e-6,
        regType='l2',
        intercept=True,
    )
def logisticRegression(trainingRDD, trainingRDDHashed,
                       testRDDHashed, iterations, minibatch, stepsize):
    """Train a logistic regression model and report accuracy and F-score.

    Args:
        trainingRDD: data used to fit ``LogisticRegressionWithSGD``.
        trainingRDDHashed: RDD of two-element records; element 0 is paired
            with the model's prediction for element 1 (reported as
            "Training Set").
        testRDDHashed: same record layout, reported as "Test Set".
        iterations: passed to ``train`` as ``iterations``.
        minibatch: passed to ``train`` as ``miniBatchFraction``.
        stepsize: passed to ``train`` as ``step``.

    Returns:
        ``(AccuracyV, fScoreV, AccuracyT, fScoreT)`` as computed by
        ``getAccuracy`` for the training-hashed and test-hashed sets.
    """
    # Train a logistic regression model
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD,
        iterations=iterations,
        miniBatchFraction=minibatch,
        regType="l2",
        intercept=True,
        regParam=0.1,
        step=stepsize)

    def tally(hashedRDD):
        # Key every record by checkState((element 0, prediction)) and count
        # how often each key occurs.
        return hashedRDD.map(
            lambda rec: (
                checkState((rec[0], trainedModel.predict(rec[1]))), 1)
        ).reduceByKey(add).collectAsMap()

    # Test on Validation and Test Sets
    resultsValidation = tally(trainingRDDHashed)
    resultsTest = tally(testRDDHashed)
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Missing outcome keys count as zero
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results. The % formatting must happen inside the print call:
    # ``print(...) % (...)`` applies % to print's None return value on
    # Python 3 and raises TypeError.
    print('   Results for Logistic Regression')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
Beispiel #38
0
def main(sc):
    """Train a logistic regression model and report validation/test metrics.

    Reads the training, validation and test text files, parses each line with
    ``parsePoint``, trains ``LogisticRegressionWithSGD``, and for the
    validation and test sets computes accuracy, false-positive rate and the
    area under the ROC curve. The accumulated report is printed along the way
    and finally saved with ``saveAsTextFile("str")``.

    Args:
        sc: the Spark context used to read the inputs and write the report.
    """
    train_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/train.txt").map(
            parsePoint)
    model = LogisticRegressionWithSGD.train(train_data,
                                            iterations=1000,
                                            miniBatchFraction=0.0001,
                                            step=.001,
                                            regType="l2")

    valid_data = sc.textFile("input/valid_data.txt").map(parsePoint)
    # Records are (prediction, label) pairs. They are indexed rather than
    # unpacked in the lambda signature, which Python 3 does not allow.
    labelsAndPreds = valid_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(valid_data.count())
    FP = labelsAndPreds.filter(
        lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output = "Accuracy valid = " + str(Accuracy) + "\nFPR valid = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    # Leading newline added: the ROC text used to be appended directly to the
    # FPR value on the same line.
    output += "\nArea under ROC valid = " + str(metrics.areaUnderROC)

    print(output)

    test_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/test.txt").map(
            parsePoint)
    labelsAndPreds = test_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(test_data.count())
    FP = labelsAndPreds.filter(
        lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output += "\nAccuracy test = " + str(Accuracy) + "\nFPR test = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    output += "\nArea under ROC test = " + str(metrics.areaUnderROC)

    print(output)

    # Persist the whole report as a single-record RDD.
    output = sc.parallelize([output])
    output.saveAsTextFile("str")
    def train_trend_model(self, model, data, i):
        """Train the classifier selected by ``self.trend_prediction_method``.

        Args:
            model: previously trained model or ``None``. For the logistic
                regression and SVM methods its ``weights`` are passed as
                ``initialWeights``; it is returned unchanged when no method
                matches.
            data: local collection that is parallelized into an RDD.
            i: not used by this method.

        Returns:
            The newly trained model, or ``model`` itself when
            ``self.trend_prediction_method`` matches none of the known methods.
        """
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        method = self.trend_prediction_method

        if method == self.RANDOM_FOREST:
            return RandomForest.trainClassifier(
                rdd_data, numClasses=2, categoricalFeaturesInfo={},
                numTrees=40, featureSubsetStrategy="auto", impurity='gini',
                maxDepth=20, maxBins=32)

        if method == self.NAIVE_BAYES:
            return NaiveBayes.train(rdd_data)

        if method == self.LOGISTIC_REGRESSION:
            # Continue from the previous model's weights when there is one.
            start_weights = None if model is None else model.weights
            return LogisticRegressionWithSGD.train(
                rdd_data, iterations=10000, step=0.001,
                initialWeights=start_weights)

        if method == self.SVM:
            start_weights = None if model is None else model.weights
            return SVMWithSGD.train(
                rdd_data, iterations=10000, step=0.001,
                initialWeights=start_weights)

        return model
Beispiel #40
0
def predict_LogisticRegressionWithSGD(iterations, step, regParam, regType):
    """Train ``LogisticRegressionWithSGD`` on ``scaledData`` and return its accuracy.

    The model is trained on the module-level ``scaledData`` RDD and evaluated
    on that same RDD, so the result is a training accuracy.

    Args:
        iterations: passed to ``train`` as ``iterations``.
        step: passed to ``train`` as ``step``.
        regParam: passed to ``train`` as ``regParam``.
        regType: passed to ``train`` as ``regType``.

    Returns:
        Fraction of ``scaledData`` records whose predicted label equals the
        actual label.
    """
    lrModel = LogisticRegressionWithSGD.train(scaledData,
                                              iterations=iterations,
                                              step=step,
                                              regParam=regParam,
                                              regType=regType)
    # (actual label, predicted label) per record.
    lrMetrics = scaledData.map(lambda p:
                               (p.label, lrModel.predict(p.features)))
    # Pairs are indexed instead of unpacked in the lambda signature, which is
    # a SyntaxError on Python 3. The denominator counts the RDD that was
    # actually scored rather than a separate global ``data``.
    correct = lrMetrics.filter(lambda pair: pair[0] == pair[1]).count()
    lrAccuracy = correct * 1.0 / scaledData.count()
    return lrAccuracy
def main(input_file_path,
         output_file_path='/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv'):
    """Train on the labelled rows of a CSV file and write predictions for the rest.

    Rows whose 4th comma-separated field is non-empty (and whose first field
    is not ``'INDEX'``) are used for training; rows whose 4th field is empty
    are scored and written to ``output_file_path`` as ``INDEX,GENDER`` lines.

    Args:
        input_file_path: path given to ``sc.textFile``.
        output_file_path: destination CSV. Defaults to the path that was
            previously hard-coded, so existing callers behave as before.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    # Rows with first field 'INDEX' are excluded - presumably the header row.
    traning_data_RDD = data.filter(lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data, iterations=100)

    # Accuracy is measured on the same data the model was trained on.
    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The context manager guarantees the file is flushed and closed; the
    # original handle was never closed. The loop variable no longer reuses
    # the name of the ``data`` RDD, and the builtin ``file`` is not shadowed.
    with open(output_file_path, 'w', encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for record in unseen_parsed_data.collect():
            # The prediction is shifted by one before being written.
            result_file.write(str(record[0]) + ',' + str(logisticRegressionWithSGD.predict(record[1]) + 1) + '\n')

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
Beispiel #42
0
    def test_classification(self):
        """Check that each classifier separates four trivially separable points.

        Points 0 and 2 carry label 0.0 and have a non-zero value only at
        index 0; points 1 and 3 carry label 1.0 and have a non-zero value only
        at index 1. Every trained model must predict ``<= 0`` for the former
        and ``> 0`` for the latter.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_predictions(model):
            # Even positions hold class-0 points, odd positions class-1 points.
            for position, vector in enumerate(features):
                prediction = model.predict(vector)
                if position % 2 == 0:
                    self.assertTrue(prediction <= 0)
                else:
                    self.assertTrue(prediction > 0)

        check_predictions(LogisticRegressionWithSGD.train(rdd))
        check_predictions(SVMWithSGD.train(rdd))
        check_predictions(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        check_predictions(DecisionTree.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo))
def anom_with_lr():
    """Train a logistic regression on anomalous vs. sampled benign patients.

    Reads the patient/procedure CSV, keeps all rows flagged anomalous plus a
    random sample of the benign rows, trains ``LogisticRegressionWithSGD`` on
    half of that set, prints accuracy / false-positive / false-negative rates
    for the other half, then scores the benign rows that were not sampled and
    writes the 10,000 with the highest predicted probability to CSV.

    Returns:
        The RDD of ``(features, predicted probability, label)`` tuples for the
        benign rows that were not sampled; an earlier intermediate value, or
        ``None``, if an exception interrupted the pipeline. Exceptions are
        printed, not re-raised.
    """
    # Bound up front so the final ``return`` cannot hit an unbound local when
    # an exception is raised before the first assignment inside the try.
    for_finding_more = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()

        #Take a random sample of 50K from the unlabeled 100K
        sqlContext.registerFunction("my_random", lambda x: x - x + random())
        sqlContext.registerDataFrameAsTable(benign, "benign")
        benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")

        # float() keeps this a true ratio; with Python 2 integer division
        # 50000/n_benign truncates to 0 whenever n_benign > 50000.
        threshold = 50000 / float(n_benign)
        into_model = benign.filter(benign.random_number <= threshold)
        for_finding_more = benign.filter(benign.random_number > threshold)

        for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
        for_finding_more = for_finding_more.drop(for_finding_more.random_number)
        #Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
        #the selected 10,000, have probabilities around 0.05

        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count())
                + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))

        all_columns = for_modeling.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway
        procedure_features = [x for x in features if (x not in categorical_features)]

        #Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.

        #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
        #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
        cat_feature_number = 0
        dict_cat_features = {}

        for feature in categorical_features:
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            # list(...) makes the single-column lookup work where dict.values()
            # returns a view instead of a list.
            distinct_values = [list(row.asDict().values())[0] for row in agvalues]
            # NOTE(review): on Python 3 encode() yields bytes rather than str;
            # confirm what create_labeled_point expects before running there.
            distinct_values = sorted(unicode_val.encode('ascii','ignore') for unicode_val in distinct_values)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
            cat_feature_number += 1

        for_modeling = for_modeling.rdd
        print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8

        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        t0 = time()
        #model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
        model = LogisticRegressionWithSGD.train(training_data) #SGCD took 69.261 seconds
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt,3)))

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt,3))) #Reports as 0.0 seconds

        # (label, prediction) pairs; indexed rather than unpacked in the
        # lambda signature so the code also parses on Python 3.
        labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
        test_accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count()/float(test_data_size)

        # float() on the denominators: count()/count() is integer division on
        # Python 2 and would truncate both rates to 0.
        fpr = labelsAndPreds.filter(lambda vp: (vp[0] == 0 and vp[1] == 1)).count()/float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count())
        fnr = labelsAndPreds.filter(lambda vp: (vp[0] == 1 and vp[1] == 0)).count()/float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count())
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))) #Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282

        # After clearing the threshold, predict() is used below as a
        # probability ('predicted_prob') rather than a 0/1 label.
        model.clearThreshold()
        for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features)) #OK
        for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label)) #OK

        try:
            for_finding_more.first() #We perform an action here because otherwise the output will be a PipelinedRDD.
            #Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
            #for_finding_more.take(5)
        except EOFError:
            print("EOF handled")

        df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
        df = df.orderBy(df.predicted_prob.desc()) #The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
        df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv') #Top one has
        #probability of 0.86818, last one has probability 0.5928958

    except Exception:
        # Best-effort: report the failure and fall through to the return.
        print("Exception in user code:")
        traceback.print_exc(file = sys.stdout)
    return for_finding_more
Beispiel #44
0
    return log_loss


# In[10]:

# try fixed hyperparameters
numIters = 500
stepSize = 1
regParam = 1e-6
regType = 'l2'
includeIntercept = True

# Train on the full batch (miniBatchFraction=1.0) with no initial weights.
model0 = LogisticRegressionWithSGD.train(rawTrainData,
                                         iterations=numIters,
                                         step=stepSize,
                                         miniBatchFraction=1.0,
                                         initialWeights=None,
                                         regParam=regParam,
                                         regType=regType,
                                         intercept=includeIntercept)
# Formatting into one string prints "weights intercept" identically on
# Python 2 and Python 3; the former print statement was Python 2 only.
print("%s %s" % (model0.weights, model0.intercept))


# In[11]:

# Mean of the training labels (sum of labels divided by the record count).
classOneFracTrain = (rawTrainData.map(lambda x: x.label)
                                 .reduce(lambda x, y: x+y))/rawTrainData.count()
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(classOneFracTrain)

# Baseline: average log loss when classOneFracTrain is used as the prediction
# for every training label.
logLossTrBase = (rawTrainData.map(lambda x: x.label)
                             .map(lambda x: computeLogLoss(classOneFracTrain, x))
                             .reduce(lambda x, y: x+y))/rawTrainData.count()
Beispiel #45
0
		label = 0	
	values = [x if x < genre else x-1 for x in values] #shift the attributes by one index
	ones = []
	ones = [1] * len(values)
	return LabeledPoint(label, SparseVector(column_num-1, values, ones))


#set hdfs path
# NOTE: the second assignment replaces the first, so only the localhost path
# is actually processed below.
data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*")
data = sc.sequenceFile("hdfs://localhost:9000/test/*")

# Merge the parsed values per key, then keep the de-duplicated value list.
# The (key, values) pair is indexed instead of unpacked in the lambda
# signature, which Python 3 does not allow.
parsedData = data.filter(filterPoint).map(parsePoint).reduceByKey(lambda x, y : x + y).map(lambda kv : list(set(kv[1])))
parsedData.cache()

#Calculate total number of columns in the dataset
column_num = parsedData.flatMap(lambda _ : _ ).distinct().count()
column_id = parsedData.flatMap(lambda _ : _ ).distinct().collect()
column_id.sort()

#choose a genre (column) to use as the target variable; currently column 1
genre = 1

sortedData = parsedData.map(sortPoint)

labeledData = sortedData.map(lambda line : (line, genre)).map(labelData)

LRSGDmodel = LogisticRegressionWithSGD.train(labeledData)

# print(...) with a single argument is valid on both Python 2 and Python 3.
print(LRSGDmodel.weights)

Beispiel #46
0
    def test_classification(self):
        """Train each classifier on four separable points and verify predictions.

        Points 0 and 2 carry label 0.0, points 1 and 3 label 1.0; every model
        must predict ``<= 0`` for the former and ``> 0`` for the latter. The
        tree-based models are additionally saved and reloaded, and the
        reloaded model's debug string must equal the original's.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_predictions(model):
            # Even positions hold class-0 points, odd positions class-1 points.
            for position, vector in enumerate(features):
                prediction = model.predict(vector)
                if position % 2 == 0:
                    self.assertTrue(prediction <= 0)
                else:
                    self.assertTrue(prediction > 0)

        def check_round_trip(model, model_class, subdir):
            # Save under temp_dir/<subdir>, reload, and compare debug strings.
            model_dir = os.path.join(temp_dir, subdir)
            model.save(self.sc, model_dir)
            same_model = model_class.load(self.sc, model_dir)
            self.assertEqual(same_model.toDebugString(), model.toDebugString())

        temp_dir = tempfile.mkdtemp()
        # try/finally so the temporary directory is removed even when an
        # assertion fails part-way through; previously it was only cleaned up
        # on success.
        try:
            lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
            check_predictions(lr_model)

            svm_model = SVMWithSGD.train(rdd, iterations=10)
            check_predictions(svm_model)

            nb_model = NaiveBayes.train(rdd)
            check_predictions(nb_model)

            categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
            dt_model = DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
            check_predictions(dt_model)
            check_round_trip(dt_model, DecisionTreeModel, "dt")

            rf_model = RandomForest.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
                maxBins=4, seed=1)
            check_predictions(rf_model)
            check_round_trip(rf_model, RandomForestModel, "rf")

            gbt_model = GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
            check_predictions(gbt_model)
            check_round_trip(gbt_model, GradientBoostedTreesModel, "gbt")
        finally:
            try:
                rmtree(temp_dir)
            except OSError:
                # Best-effort cleanup, as before.
                pass
Beispiel #47
0
    print(BASE_DATA_PATH)

    conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # read the dataset
    df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load(
        BASE_DATA_PATH + '/test.csv')

    training = df_test.map(lambda row: LabeledPoint(row.IsClick,
                                                    [float(row.SearchID), float(row.AdID), float(row.Position),
                                                     float(row.HistCTR), float(row.Price)]))

    (trainingData, testData) = training.randomSplit([0.7, 0.3])

    model = LogisticRegressionWithSGD.train(trainingData,iterations = 100,step=0.4)



    # Build the model
    model1 = SVMWithSGD.train(trainingData, iterations=100)




    # Evaluate the model on training data


    model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
OHETrainData = rawTrainData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create train labeled points
OHETrainData.cache() ##cache

OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats)) ##create validation labeled points
OHEValidationData.cache()

# running first model with fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

# print(...) with a single string is valid on both Python 2 and Python 3;
# the former print statements were Python 2 only.
print("-------------logistic regression with gradient descent---------")
model0 = LogisticRegressionWithSGD.train(data=OHETrainData, iterations=numIters, step=stepSize,regParam=regParam, regType=regType, intercept=includeIntercept) ##train model
sortedWeights = sorted(model0.weights)
print("------------/logistic regression with gradient descent---------")


def computeLogLoss(p, y):
   
    epsilon = 10e-12
    if (p==0):
      p = p + epsilon
    elif (p==1):
      p = p - epsilon
      
    if y == 1:
      z = -log(p)
    elif y == 0:
            .map(lambda lp: len(lp.features.indices))
            .sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')


# CTR prediction and log-loss evaluation, using the MLlib API

from pyspark.mllib.classification import LogisticRegressionWithSGD

# Fixed hyper-parameters for the first model.
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model0 = LogisticRegressionWithSGD.train(OHETrainData,iterations=numIters,step=stepSize,regParam=regParam,regType=regType,intercept=includeIntercept)
sortedWeights = sorted(model0.weights)
# Formatting into one string prints "weights intercept" identically on
# Python 2 and Python 3; the former print statement was Python 2 only.
print("%s %s" % (sortedWeights[:5], model0.intercept))

# Compare the intercept and the five smallest weights with the expected values.
Test.assertTrue(np.allclose(model0.intercept,  0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
                [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
                 -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')


# log损失
from math import log

def computeLogLoss(p, y):
    epsilon = 10e-12
    if y == 1 :
            .map(lambda lp: len(lp.features.indices))
            .sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')


# ** CTR prediction and logloss evaluation **
from pyspark.mllib.classification import LogisticRegressionWithSGD

# fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

# The fourth and fifth positional arguments (1.0 and None) sit in the slots
# that sibling calls name miniBatchFraction and initialWeights.
model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept)
sortedWeights = sorted(model0.weights)
# Formatting into one string prints "weights intercept" identically on
# Python 2 and Python 3; the former print statement was Python 2 only.
print("%s %s" % (sortedWeights[:5], model0.intercept))


# TEST Logistic regression
Test.assertTrue(np.allclose(model0.intercept,  0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
                [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
                 -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')


# ** Log loss **
from math import log

def computeLogLoss(p, y):
# Test-feature records, read as text lines; the user IDs are taken from these.
table1 = sc.textFile("/user/team322/junli_testFeature/*")
def f1(line):
    """Return the user ID: the first comma-separated field of a record.

    The record is converted to text, both kinds of parenthesis are removed
    and every ``None`` is rewritten to ``0`` before the field is taken.
    """
    cleaned = str(line)
    # Same substitutions, applied in the same order as a chained replace.
    for old, new in (('(', ''), (')', ''), ('None', '0')):
        cleaned = cleaned.replace(old, new)
    first_field, _, _ = cleaned.partition(',')
    return first_field
# Extract the user ID of every test-feature record and collect them into a list.
user = table1.map(f1).collect()
# Training-feature records, read as text lines.
result6 = sc.textFile("/user/team322/junli_trainFeature/*")
# Load and parse the data
def parsePoint(line):
    """Convert one training record into a ``LabeledPoint``.

    The record is converted to text, parentheses are removed and ``None`` is
    rewritten to ``0``.  The first two comma-separated fields are skipped;
    the third becomes the label and all remaining fields the features.
    """
    text = str(line).replace('(', '').replace(')', '').replace('None', '0')
    fields = text.split(',')
    numbers = list(map(float, fields[2:]))
    label, features = numbers[0], numbers[1:]
    return LabeledPoint(label, features)
# Turn every training record into a LabeledPoint.
parsedData = result6.map(parsePoint)
# Build the model
model = LogisticRegressionWithSGD.train(parsedData)
# Test-feature records again (the same path that table1 reads).
result7 = sc.textFile("/user/team322/junli_testFeature/*")
def testParsePoint(line):
    """Convert one test record into a ``LabeledPoint``.

    The record is converted to text, parentheses are removed and ``None`` is
    rewritten to ``0``.  The first comma-separated field is skipped; the
    second is placed in the label slot and all remaining fields become the
    features.
    """
    text = str(line).replace('(', '').replace(')', '').replace('None', '0')
    fields = text.split(',')
    numbers = list(map(float, fields[1:]))
    label, features = numbers[0], numbers[1:]
    return LabeledPoint(label, features)
# Parse the test records, score each one with the model and bring the
# predictions back as a list.
parsedData2 = result7.map(testParsePoint)
preds = parsedData2.map(lambda p: model.predict(p.features)).collect()
# Keep the user found at the same position as each prediction equal to 1.
userID = [user[i] for i, prediction in enumerate(preds) if prediction == 1]
# Save the selected users as a text file.
sc.parallelize(userID).saveAsTextFile('/user/team322/solution_v')
t2 = time.ctime()
Beispiel #52
0
# One nullable string column per comma-separated name in ``title``.
# The schema is built once, after all fields exist; previously ``schema`` was
# first bound to the ``None`` returned by ``list.append`` and the StructType
# was rebuilt on every loop iteration.
all_types = [StructField(str(name), StringType(), True)
             for name in title.split(",")]
schema = StructType(all_types)
from pyspark.sql import Row
from pyspark.mllib.classification import LogisticRegressionWithSGD
from numpy import array
from pyspark.mllib.regression import LabeledPoint

D = 2 ** 24


def helper1(r):
    """Turn one raw row into a ``LabeledPoint`` of hashed column values.

    ``r[0]`` must be convertible to a float (it is treated as an ID and not
    used otherwise), ``r[-1]`` is converted to the float label, and every
    column in between is concatenated with ``"VAR_"`` and its position,
    hashed, and reduced modulo ``D`` to give one feature value.

    Returns:
        The ``LabeledPoint`` for the row, or a placeholder with label 0.0 and
        1932 zero features when any conversion fails.
    """
    try:
        features = [
            float(abs(hash("VAR_" + str(i) + value))) % D
            for i, value in enumerate(r[1:-1])
        ]
        target = float(r[-1])
        # The ID itself is not needed, but a non-numeric ID must still send
        # the row to the placeholder branch, as before.
        float(r[0])
        return LabeledPoint(target, features)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return LabeledPoint(0.0, [0.0] * 1932)


# Keep only the rows that have exactly 1934 elements.
new_rdd = rdd.filter(lambda i: len(i) == 1934)
# Despite the name, ``df`` is the result of mapping helper1 over the rows.
df = new_rdd.map(helper1)

# Train logistic regression (SGD) with default settings.
model = LogisticRegressionWithSGD.train(df)
# Fetch one converted row; the result is not stored.
df.take(1)
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD


def parsePoint(line):
    """
    Build an MLlib LabeledPoint from one line of space-separated numbers.

    The first number is the label and the remaining numbers are the
    features; a label of -1 is replaced by 0.
    """
    numbers = list(map(float, line.split(' ')))
    label = numbers[0]
    features = numbers[1:]
    if label == -1:
        label = 0
    return LabeledPoint(label, features)


if __name__ == "__main__":
    # Exactly two arguments are expected: the input file and the number of
    # iterations.
    if len(sys.argv) != 3:
        print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to be defined.
        sys.exit(-1)
    # Parse the iteration count first so a bad argument fails before Spark
    # is started.
    iterations = int(sys.argv[2])
    sc = SparkContext(appName="PythonLR")
    try:
        points = sc.textFile(sys.argv[1]).map(parsePoint)
        model = LogisticRegressionWithSGD.train(points, iterations)
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))
    finally:
        # Stop the context even when loading or training raises.
        sc.stop()
Beispiel #54
0
def main():
    """Train four classifiers on the pickled data set and print their scores.

    Loads the records from ``/pickleData``, standardizes the feature vectors,
    splits the records 70/30 into training and test sets, then trains
    logistic regression (L-BFGS and SGD), a decision tree and a random
    forest.  For each model the values returned by the ``fone`` and
    ``accuracy`` helpers are printed (labelled "f1" and "ac"), followed by
    the weights of the two logistic-regression models.
    """
    appName = "BadOrGood;zl"

    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The Spark property is the plural "spark.executor.instances";
            # the singular spelling used before is not a recognised key, so
            # the executor count was never actually requested.
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    try:
        # Only the disabled fetch step below uses the Hive context.
        hc = HiveContext(sc)

        # Disabled fetch step, kept for reference:
        # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
        # fetchDataToFile(hc, filepath)

        # Disabled alternative loader, kept for reference:
        # AllDataRawrdd = sc.pickleFile(filepath) \
        #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
        #     .repartition(10)

        AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

        # Fit a standardizer on every feature vector, then rebuild the
        # records with the standardized features.
        # NOTE(review): the two positional True flags presumably enable
        # centering and scaling - confirm against StandardScaler's signature.
        scaler = StandardScaler(True, True).fit(
            AllDataRawrdd.map(lambda rec: Vectors.dense(rec['feature'])))
        labels = AllDataRawrdd.map(lambda rec: rec['label'])
        featureTransformed = scaler.transform(
            AllDataRawrdd.map(lambda rec: rec['feature']))
        AllDataRawrdd = labels.zip(featureTransformed).map(
            lambda pair: {'label': pair[0], 'feature': pair[1]})

        # 70/30 split with a fixed seed; training rows become LabeledPoints,
        # test rows stay dicts with the features as a plain list.
        trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(
            weights=[0.7, 0.3], seed=100)
        trainDatardd = trainDataRawrdd.map(
            lambda rec: LabeledPoint(rec['label'], rec['feature'])).persist()
        testDatardd = testDataRawrdd.map(
            lambda rec: {'label': rec['label'],
                         'feature': list(rec['feature'])}).persist()

        # Train each model and score it on the test set.
        lrmLBFGS = LogisticRegressionWithLBFGS.train(
            trainDatardd, iterations=3000, regParam=0.01, regType="l1")
        resultrdd = test(lrmLBFGS, testDatardd)
        lrmLBFGSFone = fone(resultrdd)
        lrmLBFGSac = accuracy(resultrdd)

        lrmSGD = LogisticRegressionWithSGD.train(
            trainDatardd, iterations=3000, step=0.1, regParam=0.01,
            regType="l1")
        resultrdd = test(lrmSGD, testDatardd)
        lrmSGDFone = fone(resultrdd)
        lrmSGDac = accuracy(resultrdd)

        dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
        resultrdd = test(dt, testDatardd)
        dtFone = fone(resultrdd)
        dtac = accuracy(resultrdd)

        rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
        resultrdd = test(rf, testDatardd)
        rfFone = fone(resultrdd)
        rfac = accuracy(resultrdd)

        # Single-argument print calls behave the same with and without
        # print_function.
        print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
        print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
        print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
        print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

        print(lrmLBFGS.weights)
        print(lrmSGD.weights)
    finally:
        # Stop the context even if loading, training or scoring fails.
        sc.stop()
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse
# Load and parse the data

#def parsePoint(line):   # Creating vector(array) with first input as y and others as xi's   
#    values = [float(x) for x in line.split(',')]
#    return LabeledPoint(values[10], values[0:9])


# Create the SparkContext.
sc = SparkContext("local[4]", "Logistic Regression")
# Read the training file as text lines.
data = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data")
# Convert every line with parse.parsePoint.
# NOTE(review): presumably this yields LabeledPoint objects, since .label and
# .features are read from the parsed test records below - confirm in the
# parse module.
parsedData = data.map(parse.parsePoint)

# Build the model
# The parsed training data is passed to LogisticRegressionWithSGD.train with
# default settings.
model = LogisticRegressionWithSGD.train(parsedData)
#Use model to create output
#model.predict().collect()    # in "predict" method we have to pass an array
#Read Test data

Testdata = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test")
parsedTestData = Testdata.map(parse.parsePoint)
#predict result for each Test Data

# Evaluate the model on the test data (not the training data): pair the label
# of every parsed test record with the model's prediction for its features.

labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))
# Current wall-clock time in whole milliseconds.
millis2 = int(round(time.time() * 1000))

# Collect and print every (label, prediction) pair.
print labelsAndPreds.collect()
#Print testing Error
Beispiel #56
0
# Fraction nrock / (nrock + nxrock), used below as the keep-probability for
# rows whose label is not 1.0.
cutoff = float(nrock) / (nrock + nxrock)

# recombine
# NOTE(review): this union is overwritten by the assignment that follows
# before it is ever used - confirm which of the two is intended.
equalSampleData = labeledRock.union(labeledNotRock)


# Keep every row labelled 1.0; keep any other row only when a random draw
# falls below cutoff.
equalSampleData = labeledData.filter(lambda p: random.random() < cutoff if p.label != 1.0 else True)

# split data
trainData, testData = randomSplit(equalSampleData, [0.9, 0.1])

# Fetch three (label, features) pairs; the result is not stored.
trainData.map(lambda p: (p.label, p.features)).take(3)

# train model
model = LogisticRegressionWithSGD.train(trainData, intercept=False, iterations=10000)
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)

# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
# (label, prediction) pair for every test row.
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))

# Share of test rows whose prediction equals the label.
# NOTE: each ratio below raises ZeroDivisionError when its denominator count is 0.
accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(testData.count())

# Precision for class 1: correct predictions among rows predicted as 1.
guess1 = labelsAndPreds.filter(lambda (v, p): p == 1)
precision1 = guess1.filter(lambda (v, p): v == p).count() / float(guess1.count())

# Recall for class 1: correct predictions among rows labelled 1.
act1 = labelsAndPreds.filter(lambda (v, p): v == 1)
recall1 = act1.filter(lambda (v, p): v == p).count() / float(act1.count())
Beispiel #57
0
 def train(self, num_iterations=10):
     """Fit logistic regression with SGD and wrap the fitted model.

     The training data comes from ``self._labeled_feature_vector_rdd()`` and
     ``num_iterations`` is passed as the second argument to
     ``LogisticRegressionWithSGD.train``.  The result is returned inside a
     ``LogisticRegressionModel`` together with ``self.feature_cols``.
     """
     training_rdd = self._labeled_feature_vector_rdd()
     fitted = LogisticRegressionWithSGD.train(training_rdd, num_iterations)
     return LogisticRegressionModel(fitted, self.feature_cols)
Beispiel #58
0
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

# Configure the application with master "local" and name "My App".
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)

# Read the spam and the normal (ham) emails as text lines.
spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")
# Create a HashingTF instance to map email text to vectors of 10,000 features.
tf = HashingTF(numFeatures = 10000)
# Each email is split on single spaces and the resulting words are passed to
# the HashingTF transformation.
spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))
# Create LabeledPoint datasets for positive (spam) and negative (normal) examples.


# Spam is labelled 1 and normal mail 0.
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))
# All examples are used for training; no test split is held out here.
trainingData = positiveExamples.union(negativeExamples)
trainingData.cache() # Cache since Logistic Regression is an iterative algorithm.
# Run Logistic Regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)
# Test on a positive example (spam) and a negative one (normal). We first apply
# the same HashingTF feature transformation to get vectors, then apply the model.
posTest = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTest = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
print "Prediction for positive test example: %g" % model.predict(posTest)
print "Prediction for negative test example: %g" % model.predict(negTest)

Beispiel #59
0
	splits = parsedData.randomSplit((0.9, 0.1))
	train_set = splits[0]
	train_set.cache()
	test_set = splits[1]
	test_set.cache()
	#NBmodel = NaiveBayes.train(train_set)
	#NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	#findCoveragePercent(NB_socredLabel, 0.4)
	SVMSGDmodel = SVMWithSGD.train(train_set)
	SVMSGDmodel.clearThreshold()
	SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))
	LRSGDmodel = LogisticRegressionWithSGD.train(train_set)	
	LRSGDmodel.clearThreshold()
	LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))
	LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
	LRLBFGSmodel.clearThreshold()
	LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0))

def getAccumulatedPercentage(socredLabel):
	result = []
	total = socredLabel.sum()