Example no. 1
def cross_validation_lr(Data_1,Data_2,Data_3,regType, num_iter):
    # Training the model using Logistic Regression Classifier
    model_train_1 =LogisticRegressionWithLBFGS.train(Data_1.union(Data_2),
                                                     regType =regType, iterations=num_iter, numClasses=5)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda (v, p): (v +0.5 - p) * (v +0.5- p )).sum() /\
        float(Data_3.count())


    model_train_2 =LogisticRegressionWithLBFGS.train(Data_2.union(Data_3),
                                                     regType =regType, iterations=num_iter, numClasses=5)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda (v, p): (v +0.5- p) * (v +0.5- p )).sum() /\
        float(Data_1.count())


    model_train_3 =LogisticRegressionWithLBFGS.train(Data_3.union(Data_1),
                                                     regType =regType, iterations=num_iter, numClasses=5)


    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda (v, p): (v +0.5- p ) * (v +0.5- p)).sum() /\
        float(Data_2.count())

    return (testMSE_1+testMSE_2+testMSE_3)/3
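A minimal driver sketch for cross_validation_lr (hypothetical: data is assumed to be an RDD of LabeledPoint, and the three-way split is illustrative only):

# Hypothetical usage: split a LabeledPoint RDD into three roughly equal folds
# and average the per-fold MSE computed by cross_validation_lr above.
fold_1, fold_2, fold_3 = data.randomSplit([1.0, 1.0, 1.0], seed=42)
avg_mse = cross_validation_lr(fold_1, fold_2, fold_3, regType="l2", num_iter=100)
print(avg_mse)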
def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)

        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled point
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
Example no. 3
    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(
                train_data,
                numClasses=numOfClasses,
                regParam=0,
                regType=regularization,
                intercept=True,
                iterations=num_of_iterations,
                validateData=False)

            self.model = logistic_model

        except Exception as e:
            raise e
Example no. 4
def lrTest(sqlContext,dataset_rdd,positive_negotive_rate):
	dataset_positive = dataset_rdd.filter(lambda e:e[1]>0.5)
	dataset_negotive =  dataset_rdd.filter(lambda e:e[1]<0.5)
	train_positive = dataset_positive.sample(False,0.8)
	test_positive = dataset_positive.subtract(train_positive)
	train_negotive = dataset_negotive.sample(False,0.8)
	test_negotive = dataset_negotive.subtract(train_negotive)
	trainset_rdd = train_positive.union(train_negotive)
	testset_rdd = test_positive.union(test_negotive)
	trainset = trainset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
	trainset_nums = trainset.count()
	testset = testset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
	testset_nums = testset.count()
	trainset_positive = train_positive.count()
	testset_positive = test_positive.count()
	model = LogisticRegressionWithLBFGS.train(trainset,iterations = 100)
	predict = testset.map(lambda p:(p.label,model.predict(p.features)))
	hitALL =predict.filter(lambda e:e[0]==e[1]).count()
	hitPositive = predict.filter(lambda e:e[0]==e[1] and (e[0]>0.5)).count()
	positive = predict.filter(lambda e:e[1]>0.5).count()
	recallPositive = hitPositive/float(testset_positive)
	precision = hitPositive/float(positive)
	accuracy = hitALL/float(testset.count())
	F_Value = 2/(1/precision+1/recallPositive)
	return (trainset_nums,testset_nums,trainset_positive,testset_positive,positive,hitPositive,precision,recallPositive,accuracy,F_Value,model)
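A hypothetical call of lrTest, assuming each element of dataset_rdd is an indexable record whose second field is the 0/1 label and whose remaining fields are features:

# Hypothetical usage: run the split/train/test pipeline and pull out the headline metrics.
results = lrTest(sqlContext, dataset_rdd, 0.8)
precision, recall, accuracy, f_value = results[6], results[7], results[8], results[9]
print(precision, recall, accuracy, f_value)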
Example no. 5
def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Logistic regression Classifier
    model_train = LogisticRegressionWithLBFGS.train(sc.parallelize(data_train.collect(),5),
                                                    regType =regType, iterations=num_iter, numClasses=5)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def main():
    training_rdd = sc.textFile(train_inputs).map(to_LP_training).filter(lambda lp: lp!=None)
    testing_rdd = sc.textFile(test_inputs).map(to_LP_testing).filter(lambda lp: lp!=None)

    # # Logistic Regression with SGD
    # lg_model = LogisticRegressionWithSGD.train(training_rdd, step = 0.1, regType = 'l1')
    # lg_prediction = testing_rdd.map(lambda (qt, sv): (qt, lg_model.predict(sv)))
    #
    #
    # Logistic Regression with LBFGS
    lg_model2 = LogisticRegressionWithLBFGS.train(training_rdd)
    lg_prediction2 = testing_rdd.map(lambda (qt, sv): (qt, lg_model2.predict(sv)))
    #
    #
    # # SVM with SGD
    # svm_model = SVMWithSGD.train(training_rdd, step = 0.01)
    # svm_prediction = testing_rdd.map(lambda (qt, sv): (qt, svm_model.predict(sv)))


    # print 'Logistic Regression with SGD results: ', len(lg_prediction.filter(lambda (idx, p):p!=0).collect())
    result = lg_prediction2.collect()
    # print 'SVM with SGD', len(svm_prediction.filter(lambda (idx, p):p!=0).collect())

    with open('[your result.csv path]', 'w') as csvfile:
        fieldnames = ['QuoteNumber', 'QuoteConversion_Flag']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for l in result:
            writer.writerow({'QuoteNumber': l[0], 'QuoteConversion_Flag': l[1]})
    def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))

            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp: (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            LOGGER.info(prediction_and_label.map(lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
        except Exception as e:
            raise e
Example no. 8
def logistic_regression(sc, in1, **params):
    temp = in1.map(lambda x: LabeledPoint(
        x[int(params['label'])], x[:int(params['label'])] + x[int(params[
            'label']) + 1:]))
    temp = LogisticRegressionWithLBFGS.train(
        temp,
        iterations=int(params['iterations']),
        numClasses=int(params['numClasses']))
    return True, temp
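A hypothetical call, assuming in1 is an RDD of numeric lists and that params carries string values, since the function casts them with int():

# Hypothetical usage: column 0 holds the class label, the remaining columns are features.
ok, model = logistic_regression(sc, in1, label='0', iterations='100', numClasses='2')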
Example no. 9
 def train(self, input_data, parameters):
     iterations = parameters.get('iterations', None)
     weights = parameters.get('weights', None)
     intercept = parameters.get('intercept', None)
     numFeatures = parameters.get('numFeatures', None)
     numClasses = parameters.get('numClasses', None)
     data = self._sc.parallelize(self._parser.parse(input_data))
     self._model = LogisticRegressionWithLBFGS.train(data,\
      iterations=iterations,\
      numClasses=numClasses)
Example no. 10
def RunLogit(tf):
	rdd = tf.map(parseAsLabeledPoints)
	train, test = rdd.randomSplit([.8, .2])
	numCat = len(genCats)
	model = LogisticRegressionWithLBFGS.train(train, numClasses=numCat, iterations=100)
	predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
	accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

	print 'Accuracy of Logit = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Example no. 11
def get_error(training, test):

    model = LogisticRegressionWithLBFGS.train(training, numClasses=18)

    # Evaluate the model on the test data
    labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    ERR = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        test.count())
    print("Training Error = " + str(ERR))
    return ERR
Example no. 12
def main():
    conf = SparkConf().setMaster("local").setAppName("Assignment 1")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.setLogLevel("ERROR")
    #part 1
    data = sc.textFile('/home/disha/Downloads/MSD.txt', 2)
    dc = data.count()
    #print data.count()
    #print data.take(40)
    sdata = data.take(40)
    #part 2
    lp = [parse_line(p) for p in sdata]
    #part 3
    x1 = list(lp[i].features[3] for i in range(40))
    x2 = list(lp[i].features[4] for i in range(40))
    dataFrame = sqlContext.createDataFrame([(Vectors.dense(x1), ),
                                            (Vectors.dense(x2), )],
                                           ["features"])
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    x = scaledData.select("scaledFeatures").map(list).collect()
    xdf = pd.DataFrame({'1': x[0][0], '2': x[1][0]})
    '''
  fig, ax = plt.subplots()
  heatmap = ax.pcolor(xdf, cmap=plt.cm.Greys, alpha=0.8)
  fig = plt.gcf()
  fig.set_size_inches(8, 11)
  ax.set_frame_on(False)
  ax.invert_yaxis()
  ax.xaxis.tick_top()'''
    #plt.show()
    #part 4
    onlyLabels = data.map(parse_line).map(
        lambda point: int(point.label)).collect()
    minYear = min(onlyLabels)
    maxYear = max(onlyLabels)
    print maxYear, minYear
    lp_rdd = data.map(parse_line).map(
        lambda l: LabeledPoint(int(l.label) - minYear, l.features))
    #print lp_rdd.take(10)
    #part 5
    train, test = lp_rdd.randomSplit([0.8, 0.2])
    model = LogisticRegressionWithLBFGS.train(train,
                                              iterations=10,
                                              numClasses=maxYear - minYear + 1)
    vp = test.map(lambda p: (model.predict(p.features), p.label))
    rmse = getrmse(vp)
    print rmse
    a1 = test.map(lambda p: model.predict(p.features)).collect()
    a2 = test.map(lambda p: int(p.label)).collect()
    plt.scatter(a1, a2)
    plt.show()
Example no. 13
def train_model(training_rdd, **kwargs):
    """
    Train a classifier model using an RDD training dataset.
    :param training_rdd: the RDD of the training dataset
    :param kwargs: additional key-value params for the training (if any)
    :return: the trained logistic regression model
    """
    return LogisticRegressionWithLBFGS.train(training_rdd,
                                             regType=_REGULARIZATION,
                                             intercept=_INTERCEPT,
                                             **kwargs)
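A hypothetical call; _REGULARIZATION and _INTERCEPT are module-level constants defined elsewhere, and any extra keyword arguments are forwarded to LogisticRegressionWithLBFGS.train:

# Hypothetical usage: pass algorithm options through **kwargs.
model = train_model(training_rdd, iterations=200, numClasses=2)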
Example no. 14
def train_model(training_rdd, **kwargs):
    """
    Train a classifier model using an RDD training dataset.
    :param training_rdd: the RDD of the training dataset
    :param kwargs: additional key-value params for the training (if any)
    :return: the trained logistic regression model
    """
    return LogisticRegressionWithLBFGS.train(training_rdd,
                                             regType=_REGULARIZATION,
                                             intercept=_INTERCEPT,
                                             **kwargs)
def regression(reg_data):
    (trainingData, testData) = reg_data.randomSplit([0.7, 0.3])
    lrmodel = LogisticRegressionWithLBFGS.train(trainingData)
    labelsAndPreds = testData.map(lambda p: (p.label, lrmodel.predict(p.features)))

    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
    falsePos = labelsAndPreds.filter(lambda (v, p): v != p and v == 0.0).count() / float(testData.filter(lambda lp: lp.label == 0.0).count())
    falseNeg = labelsAndPreds.filter(lambda (v, p): v != p and v == 1.0).count() / float(testData.filter(lambda lp: lp.label == 1.0).count())

    print "*** Error Rate: %f ***" % trainErr
    print "*** False Positive Rate: %f ***" % falsePos
    print "*** False Negative Rate: %f ***" % falseNeg
Example no. 16
    def __init__(self, sc):
        """Init the engine and train the model
        """

        logger.info("Starting up the GeneLearn Engine: ")

        self.sc = sc

        logger.info("Loading training data...")
        dataset_path = "/Users/qingpeng/Dropbox/Development/Bitbucket/jgi-genelearn/scripts/Flask"
        training_file_path = os.path.join(dataset_path, 'training.svmlib')
        training = MLUtils.loadLibSVMFile(sc, training_file_path)
        self.model = LogisticRegressionWithLBFGS.train(training)
Example no. 17
def TrainLRCModel(trainingData, testData):
	print(type(trainingData))
	print(trainingData.take(2))
	model = LogisticRegressionWithLBFGS.train(trainingData, numClasses=5)
	print(type(model))
	exit();
	predictions = testData.map(lambda p: (p.label, model.predict(p.features)))

	correct = predictions.filter(lambda (x, p): x == p)
	### Calculate the accuracy of the model using custom method
	accuracy = round((correct.count() / float(testData.count())) * 100, 3)
	# return the final accuracy
	return accuracy
Example no. 18
def sim_function(isim, patsim, dataset, ss_ori):

    #select patients in each simulation from patsim
    valsimid = patsim.filter(patsim.simid == isim)

    sssim = ss_ori\
        .join(valsimid,valsimid.matched_positive_id==ss_ori.matched_positive_id,'inner')\
        .select(ss_ori.matched_positive_id, ss_ori.label, ss_ori.patid,ss_ori.features)

    #select the corresponding training and test set
    valsim = dataset\
        .join(valsimid, valsimid.matched_positive_id==dataset.matched_positive_id,
              'inner')\
        .select(dataset.matched_positive_id, dataset.label, dataset.patid,dataset.features)

    trsim = dataset.subtract(valsim)

    #get LabeledPoint RDD data
    trsimrdd = trsim.map(parsePoint)
    valsimrdd = valsim.map(parsePoint)
    sssimrdd = sssim.map(parsePoint)

    # Build the model
    sim_model = LogisticRegressionWithLBFGS.train(trsimrdd,
                                                  intercept=True,
                                                  regType=None)
    #clear the threshold
    sim_model.clearThreshold()

    #output model
    sim_model.save(sc, resultDir_s3 + "model_sim" + str(isim))
    #load model
    #model = LogisticRegressionModel.load(sc, resultDir_s3+"model_sim"+str(isim))

    #predict on test data
    scoreAndLabels_val = valsimrdd.map(
        lambda p: (float(sim_model.predict(p.features)), p.label))
    scoreAndLabels_ss = sssimrdd.map(
        lambda p: (float(sim_model.predict(p.features)), p.label))

    #Identify the probability of response
    pred_score_val = scoreAndLabels_val.toDF()\
        .withColumnRenamed('_1', 'prob_1')\
        .withColumnRenamed('_2', 'label')

    pred_score_ss = scoreAndLabels_ss.toDF()\
        .withColumnRenamed('_1', 'prob_1')\
        .withColumnRenamed('_2', 'label')

    return [pred_score_val, pred_score_ss]
Example no. 19
def validation_lr(trainingData,testData, regType, num_iter):
    # Training the model using Logistic Regression Classifier

    model_train =LogisticRegressionWithLBFGS.train(trainingData, regType =regType, iterations=num_iter, numClasses=5)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))

    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    return testMSE
def main(input_file_path):

    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(lambda line: line.split(',')[4] != '' and
                                   line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[4] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithLBFGS = LogisticRegressionWithLBFGS.train(
        parsed_data, iterations=500, numClasses=100)

    labels_and_preds = parsed_data.map(
        lambda lp:
        [lp.label, logisticRegressionWithLBFGS.predict(lp.features)])
    Accuracy = float(
        labels_and_preds.filter(lambda ele: (int(ele[0]) - int(ele[1]))**2).
        reduce(lambda x, y: x + y)[0]) / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    file = open(
        '/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result2.csv',
        'w',
        encoding='utf-8')
    file.write('INDEX,AGE\n')
    for data in unseen_parsed_data.collect():
        file.write(
            str(data[0]) + ',' +
            str(logisticRegressionWithLBFGS.predict(data[1])) + '\n')
    # print(labels_and_preds.collect())

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
Example no. 21
def train_evaluate_model(train_data, valid_data, iterations, regParam):
    start_time = time()
    # Train the model
    model = LogisticRegressionWithLBFGS.train(train_data,
                                              numClasses=2,
                                              iterations=iterations,
                                              regParam=regParam)
    # Evaluate
    # y_pred y_true
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(
        f"Train/evaluate: params iterations={iterations}, regParam={regParam} ==> elapsed time={duration}, AUC = {AUC}"
    )
    return AUC, duration, iterations, regParam, model
Example no. 22
def regression(reg_data):
    
    train_data, test_data = reg_data.randomSplit([0.7, 0.3])
    model = LogisticRegressionWithLBFGS.train(train_data)
    labels_predictions = test_data.map(lambda p: (p.label, model.predict(p.features)))

    train_error = labels_predictions.filter(lambda (v, p): v != p).count() / float(test_data.count())
    false_pos = labels_predictions.filter(lambda (v, p): v != p and v == 0.0).count() / float(
        test_data.filter(lambda lp: lp.label == 0.0).count())
    false_neg = labels_predictions.filter(lambda (v, p): v != p and v == 1.0).count() / float(
        test_data.filter(lambda lp: lp.label == 1.0).count())

    print "*** Error Rate: %f ***" % train_error
    print "*** False Positive Rate: %f ***" % false_pos
    print "*** False Negative Rate: %f ***" % false_neg
Example no. 23
def main():

    #parameters
    num_features = 400  #vocabulary size

    #load data
    print "loading 20 newsgroups dataset..."
    categories = [
        'rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space'
    ]
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=0,
                                 categories=categories,
                                 remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # a list of 11314 documents / entries
    train_labels = dataset.target
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)

    #tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features, \
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()

    #append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])

    #distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)

    #distributed logistic regression
    print "training logistic regression..."
    model = LogisticRegressionWithLBFGS.train(train_RDD,
                                              regParam=1,
                                              regType='l1',
                                              numClasses=len(categories))

    #evaluated the model on test data
    labels_and_preds = test_RDD.map(lambda p:
                                    (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda (v, p): v != p).count() / float(
        test_RDD.count())
    print "log-reg test error: ", test_err
def training(path):
	#import dataset into RDD
	raw_data = sc.textFile(path)
	#parse raw data into label bag-of-words pairs
	parsed_data = raw_data.map(lambda line: parse_line(line))
	#separate into training set and test set
	training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
	#get features for model training
	features = feature_extraction(training_set)
	labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
	labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
	#train logistic regression model
	lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
	#train naive bayes model
	nbModel = NaiveBayes.train(labeled_points_training)
	return lrModel, nbModel, labeled_points_test
Example no. 25
def regression(reg_data):
    (trainingData, testData) = reg_data.randomSplit([0.7, 0.3])
    lrmodel = LogisticRegressionWithLBFGS.train(trainingData)
    labelsAndPreds = testData.map(lambda p:
                                  (p.label, lrmodel.predict(p.features)))

    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
        testData.count())
    falsePos = labelsAndPreds.filter(lambda vp: vp[0] != vp[1] and vp[0] == 0.0).count(
    ) / float(testData.filter(lambda lp: lp.label == 0.0).count())
    falseNeg = labelsAndPreds.filter(lambda vp: vp[0] != vp[1] and vp[0] == 1.0).count(
    ) / float(testData.filter(lambda lp: lp.label == 1.0).count())

    print("*** Error Rate: %f ***" % trainErr)
    print("*** False Positive Rate: %f ***" % falsePos)
    print("*** False Negative Rate: %f ***" % falseNeg)
def trainModel(lpRDD):
    """ Train 3 classifier models on the given RDD with LabeledPoint objects. A list of trained model is returned. """
    lpRDD.persist(
        StorageLevel.MEMORY_ONLY
    )  # not really needed as the Spark implementations ensure caching themselves. Other implementations might not, however.
    # Train a classifier model.
    print('Starting to train the model')  #give some immediate feedback
    model1 = LogisticRegressionWithLBFGS.train(lpRDD)  # this is the best model
    print('Trained LR (model1)')
    #print(type(model1))
    model2 = NaiveBayes.train(lpRDD)  # doesn't work well
    print('Trained NB (model2)')
    print(type(model2))
    model3 = SVMWithSGD.train(lpRDD)  # or this ...
    print('Trained SVM (model3)')
    return [model1, model2, model3]
Example no. 27
def main():
    #spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print "Session created!"
    sc = spark.sparkContext
    print "The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId

    print sys.argv
    sampleHDFS_1 = sys.argv[1]
    sampleHDFS_2 = sys.argv[2]
    outputHDFS = sys.argv[3]

    sampleRDD = sc.textFile(sampleHDFS_1).map(parse)
    predictRDD = sc.textFile(sampleHDFS_2).map(lambda x: parse(x, True))

    # Train the model
    model = LogisticRegressionWithLBFGS.train(sampleRDD)
    model.clearThreshold()  # clear the default threshold (otherwise predict outputs hard 0/1 labels)

    # Predict and save the results
    labelsAndPreds = predictRDD.map(
        lambda p: (p[0], p[1].label, model.predict(p[1].features)))
    labelsAndPreds.map(lambda p: '\t'.join(map(str, p))).saveAsTextFile(
        outputHDFS + "/target/output")

    # Evaluate accuracy and recall under different thresholds
    labelsAndPreds_label_1 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 1)
    labelsAndPreds_label_0 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 0)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print "thre\ttp\ttn\tfp\tfn\taccuracy\trecall"
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = labelsAndPreds_label_1.filter(lambda lp: lp[2] > thre).count()
        tn = t_cnt - tp
        fp = labelsAndPreds_label_0.filter(lambda lp: lp[2] > thre).count()
        fn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f" %
              (thre, tp, tn, fp, fn, float(tp) / (tp + fp), float(tp) /
               (t_cnt)))

    # Save the model, then load it back
    model.save(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")

    print "output:", outputHDFS
def create_or_load_model(sc: SparkContext,
                         train_dataset_path: str) -> LogisticRegressionModel:
    if not os.path.exists(MODEL_PATH):
        print('training model...')
        dataset_rdd = sc.textFile(train_dataset_path)
        table_rdd = dataset_rdd.map(lambda line: line.split(','))
        labeled_features = rdd_to_feature(table_rdd)
        # labeled_features.foreach(lambda lp: print(lp))
        labeled_features.cache()
        model = LogisticRegressionWithLBFGS.train(labeled_features,
                                                  numClasses=NUM_CLASSES)
        model.setThreshold(0.5)
        model.save(sc, MODEL_PATH)
        return model
    else:
        model = LogisticRegressionModel.load(sc, MODEL_PATH)
        return model
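A hypothetical driver for create_or_load_model, assuming MODEL_PATH, NUM_CLASSES and rdd_to_feature are defined elsewhere in the module:

# Hypothetical usage: train (or reload) the model, then score one feature vector.
sc = SparkContext(appName="lr_lbfgs_demo")
model = create_or_load_model(sc, "data/train.csv")
print(model.predict([0.0, 1.0, 2.0]))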
Example no. 29
def logistic_model(sc):

    #    global conf
    #    conf.setAppName("data analyse")
    #    sc = SparkContext(conf=conf)
    #    print ("Successfully started SparkContext")
    data = sc.textFile("file://" + ROOTDIR + "/sample_svm_data.txt")
    parsedData = data.map(parsePoint)
    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        parsedData.count())
    print("Training Error = " + str(trainErr))
Example no. 30
    def test_train(self,
                   df,
                   target,
                   train_split,
                   test_split,
                   regularization=None,
                   num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split],
                                               seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))

            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(
                train_data,
                numClasses=numOfClasses,
                regParam=0,
                regType=regularization,
                intercept=True,
                iterations=num_of_iterations,
                validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp:
                                    (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            LOGGER.info(
                prediction_and_label.map(lambda labelAndPred: labelAndPred[0]
                                         == labelAndPred[1]).mean())
        except Exception as e:
            raise e
Example no. 31
def Train_Model(trainingRDD, method, parameter_Iterations, parameter_stepSize,
                parameter_reqParam):
    # model load in.
    if method == 'Logistic':
        Logistic_Model = LogisticRegressionWithLBFGS.train(
            trainingRDD,
            iterations=parameter_Iterations,
            regParam=parameter_reqParam)
        return Logistic_Model
    elif method == 'SVM':
        SVM_Model = SVMWithSGD.train(trainingRDD,
                                     iterations=parameter_Iterations,
                                     step=parameter_stepSize,
                                     regParam=parameter_reqParam)
        return SVM_Model
    else:
        return "No this method."
Example no. 32
def predictions(train_data_labeled,test_data_labeled):

    time_start=time.time()
    model_lrm = LogisticRegressionWithLBFGS.train(train_data_labeled,
                iterations=100, initialWeights=None, regParam=0.01,
                regType='l2', intercept=False, corrections=10, tolerance=0.0001,
                validateData=True, numClasses=10)


    predictions = model_lrm.predict(test_data_labeled.map(lambda x: x.features))
    predict_label = test_data_labeled.map(lambda x: x.label).repartition(1).saveAsTextFile("hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/stage3")
    labels_and_predictions = test_data_labeled.map(lambda x: x.label).zip(predictions)
    lrAccuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data_labeled.count())

    time_end=time.time()
    time_lrm=(time_end - time_start)
    print("=========================================================================================================")
    print("run time: {},LogisticRegression accuracy: {}".format(time_lrm,lrAccuracy))
Example no. 33
def fit_and_predict(rdd):
    '''
    Fits a logistic regression model.
    
    Parameters
    ----------
    rdd: A pyspark.rdd.RDD instance.
    
    Returns
    -------
    An RDD of (label, prediction) pairs.
    '''
    #Creates a logistic regression model with 10 iterations and predicts on the entire dataset
    model=LogisticRegressionWithLBFGS.train(rdd, iterations=10)
    #makes RDD with label and predictions
    rdd=rdd.map(lambda x: (x.label, float(model.predict(x.features))))
    
    return rdd
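A short follow-up sketch showing how the (label, prediction) RDD returned by fit_and_predict might be turned into an accuracy figure (variable names here are hypothetical):

# Hypothetical usage: compute accuracy from the returned (label, prediction) pairs.
pairs = fit_and_predict(labeled_rdd)
accuracy = pairs.filter(lambda lp: lp[0] == lp[1]).count() / float(pairs.count())
print(accuracy)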
def logisticRegression(features,sc,output_n):
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]
	
	testing_features_labels = features_and_label[70:]

	labeled_training = []
	labeled_testing = []
	for x in training_features_labels:
		labeled_training.append(LabeledPoint(x[0],x[1]))

	for y in testing_features_labels:
		labeled_testing.append(LabeledPoint(y[0],y[1]))

	test = sc.parallelize(labeled_testing)

	logregression_model = LogisticRegressionWithLBFGS.train(sc.parallelize(labeled_training))
	predictions = test.map(lambda line: (line.label, float(logregression_model.predict(line.features))))
	return predictions
Example no. 35
def TrainLRCModel(trainingData, testData):
	print(type(trainingData))
	print(trainingData.take(2))

	# Map the training dataset into LabeledPoint objects
	trainingData = trainingData.map(lambda row: LabeledPoint(row.label, row.features))
	print('After changing the dataset type to labeled Point')
	print(type(trainingData))
	print(trainingData.take(2))

	model = LogisticRegressionWithLBFGS.train(trainingData, numClasses=5)
	print(type(model))
	exit();
	predictions = testData.map(lambda p: (p.label, model.predict(p.features)))

	correct = predictions.filter(lambda (x, p): x == p)
	### Calculate the accuracy of the model using custom method
	accuracy = round((correct.count() / float(testData.count())) * 100, 3)
	# return the final accuracy
	return accuracy
Example no. 36
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                    numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)

    return model
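A hypothetical call; training is an RDD of LabeledPoint and the name argument selects which MLlib trainer is used:

# Hypothetical usage: train the logistic regression variant and score the training set.
model = create_model('logistic', training)
preds = training.map(lambda p: (p.label, model.predict(p.features)))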
def main(input_file_path):

    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(lambda line: line.split(',')[4] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[4] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithLBFGS = LogisticRegressionWithLBFGS.train(parsed_data, iterations=500, numClasses=100)

    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithLBFGS.predict(lp.features)])
    Accuracy = float(labels_and_preds.filter(lambda ele: (int(ele[0]) - int(ele[1])) ** 2).reduce(lambda x, y: x + y)[0]) / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    file = open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result2.csv', 'w',
                encoding='utf-8')
    file.write('INDEX,AGE\n')
    for data in unseen_parsed_data.collect():
        file.write(str(data[0]) + ',' + str(logisticRegressionWithLBFGS.predict(data[1])) + '\n')
    # print(labels_and_preds.collect())



    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
Example no. 38
    def train(self, feat='tfidf'):
        """
        Trains a multinomial NaiveBayes (or logistic regression, depending on self.model_type) classifier on TF-IDF features.

        Parameters
        ---------
        Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

        Returns
        ---------
        model: MLLib NaiveBayesModel object, trained.
        test_score: Accuracy of the model on test dataset.
        """
        if not self.lp_path:
            self.labeled_points = self.make_labeled_points(self.extract_features())
        self.make_train_test(self.test_size)

        train_rdd = self.labeled_points.join(self.y_train) \
                        .map(lambda (key, (lp, label)): lp) \
                        .repartition(self.n_part).cache()

        if self.model_type == 'naive_bayes':
            nb = NaiveBayes()
            self.model = nb.train(train_rdd)

        elif self.model_type == 'log_reg':
            n_classes = len(self.unique_ratings())
            features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
            logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
            self.model = logreg

        # elif self

        return self
Example no. 39
def processData(sc):
	#load and parse the data
	raw_data = sc.textFile(DATA_FILE)
	raw_data.persist()	
	
	print "Train data size {}".format(raw_data.count()) 
	# map data to a format needed for logistic regression
	parsedData = raw_data.map(mapper)
	
	print "Sample of input to algorithm ", parsedData.take(10)
	
	# Train model
	t0 = time()	
	model = LogisticRegressionWithLBFGS.train(parsedData)
	t1 = time() - t0
	print "Classifier trained in {} seconds".format(round(t1, 3))

	labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features)))
	
	# Evaluating the model on training data
	trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())

	# Print some stuff
	print("Training Error = " + str(trainErr))
Example no. 40
"""


# Imports
# The L-BFGS method approximates the objective function locally 
# as a quadratic without evaluating the second partial derivatives of the objective function to construct the Hessian matrix. 
# L-BFGS is preferred over mini-batch gradient descent for faster convergence.
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
from numpy import array

sc = SparkContext("local", "SVM")

# Loading and parsing data
def parsePoint(line):
	vals = [float(i) for i in line.split(' ')]
	return LabeledPoint(vals[0], vals[1:])

# Sample data provided by Spark 1.3.1 folder
data = sc.textFile("jingrong/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Building the model 
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluate the model based on training data
labelAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainingError = labelAndPreds.filter(lambda (v,p): v!=p).count() / float(parsedData.count())

print "Training Error: ", str(trainingError)
Example no. 41
def logsreg(loadTrainingFilePath, sc):
	# Load training data in LIBSVM format
	loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt'
	data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)
	
	
	# Split data into training (60%) and test (40%)
	traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels);

	print 'Completed.'
	# Instantiate metrics object
	# metrics = MulticlassMetrics(predictionAndLabels)

	# # Overall statistics
	# precision = metrics.precision()
	# recall = metrics.recall()
	# f1Score = metrics.fMeasure()
	# #confusion_matrix = metrics.confusionMatrix().toArray()

	# print("Summary Stats")
	# print("Precision = %s" % precision)
	# print("Recall = %s" % recall)
	# print("F1 Score = %s" % f1Score)


	# # Statistics by class
	# labels = traindata.map(lambda lp: lp.label).distinct().collect()
	# for label in sorted(labels):
	#     print("Class %s precision = %s" % (label, metrics.precision(label)))
	#     print("Class %s recall = %s" % (label, metrics.recall(label)))
	#     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# # Weighted stats
	# print("Weighted recall = %s" % metrics.weightedRecall)
	# print("Weighted precision = %s" % metrics.weightedPrecision)
	# print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	# print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	# print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]	

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")
Example no. 42
 def train(self, num_iterations=10):
     model = LogisticRegressionWithLBFGS.train(
         self._labeled_feature_vector_rdd(), 
         num_iterations)
     return LogisticRegressionModel(model, self.feature_cols)
Example no. 43
regParams = [1e-3]
corrections = [30]
tolerances = [1e-4]

bestReg = 0
bestCor = 0
bestTol = 0
bestLogLoss = float('inf')
bestModel = None

from pyspark.mllib.classification import LogisticRegressionWithLBFGS

for reg in regParams:
    for cor in corrections:
        for tol in tolerances:

            model = LogisticRegressionWithLBFGS.train(hashedTrainData, iterations=100, initialWeights=None, regParam=reg, regType='l2',
            intercept=False, corrections=cor, tolerance=tol, validateData=True, numClasses=2)
            logLossVa = (hashedValidationData.map(lambda p: (p.label, getCTRProb(p.features, model.weights, model.intercept)))
                                             .map(lambda p: computeLogLoss(p[1], p[0]))
                                             .reduce(lambda a,b: a+b))/hashedValidationData.count()
    #        logLossVa = evaluateModel(model, hashedValidationData)
            print logLossVa, reg, cor, tol
            if logLossVa < bestLogLoss:
                bestModel = model
                bestLogLoss = logLossVa
                bestReg = reg
                bestCor = cor
                bestTol = tol

print bestLogLoss, bestReg, bestCor, bestTol

Example no. 44
traindays = set(traindays.collect()) # for fast searching

# read the data, filtering it to keep only traindays and non-cancels
# the header is organically removed because FL_DATE is not a trainday
#allfields = sc.textFile('gs://cloud-training-demos/flights/201501.csv') \
allfields = sc.textFile('gs://cloud-training-demos/flights/2015*.csv') \
           .map(lambda line : line.split(',')) \
           .filter(lambda fields: fields[0] in traindays and \
                                  fields[22] != '')

# these are the fields we'll use in the regression
# format is LabeledPoint(label, [x1, x2, ...]) 
flights = allfields.map(lambda fields: LabeledPoint(\
              float(float(fields[22]) < 15), #ontime \
              [ \
                  float(fields[15]), # DEP_DELAY \
                  float(fields[16]), # TAXI_OUT \
                  float(fields[26]), # DISTANCE \
              ]))

#flights.saveAsTextFile('gs://cloud-training-demos/flights/sparkoutput/train')

lrmodel = LogisticRegressionWithLBFGS.train(flights, intercept=True)
print lrmodel.weights,lrmodel.intercept

lrmodel.setThreshold(0.7) # cancel if prob-of-ontime < 0.7

#print lrmodel.predict([36.0,12.0,594.0])

lrmodel.save(sc, 'gs://cloud-training-demos/flights/sparkoutput/model')
Err = 0.0
results = []
for train_index, test_index in ss:
	X_training, Y_training, X_test, Y_test = [], [], [], []
	for i in train_index:
		X_training.append(X[i])
		Y_training.append(Y[i])
	for i in test_index:
		X_test.append(X[i])
		Y_test.append(Y[i])
		
	parsedData = []
	for i in range(0, len(X_training)):
		parsedData.append(LabeledPoint(Y_training[i], X_training[i]))
		
	model = LogisticRegressionWithLBFGS.train(sc.parallelize(parsedData))
		
	testErr = 0
	for i in range(0, len(X_test)):
		a = Y_test[i]
		b = model.predict(X_test[i])
		#b = 1
		if a != b:
			testErr += 1
		
	Err += float(testErr) / float(len(X_test))

	 
print ("AVG test error: %.6f" % 
	(Err/iter_number))
Example no. 46
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
#creating RDD of reviews
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	sentence=paragraph_to_wordlist(p[2])))

#creating the dataframe
reviewDF = sqlContext.createDataFrame(review)
#transforming the words to vectors using the trained model
transformDF = wvModel.transform(reviewDF)
#segregating the labels and features
selectData = transformDF.select("label","features","id")
#Creating RDD of LabeledPoints
lpSelectData = selectData.map(lambda x : (x.id, LabeledPoint(x.label,x.features)))
#Spliting the data for training and test
(trainingData, testData) = lpSelectData.randomSplit([0.9, 0.1])
# training the Logistic regression with LBFGS model
lrm = LogisticRegressionWithLBFGS.train(trainingData.map(lambda x: x[1]), iterations=10)
#fetching the labels and predictions for test data
labelsAndPreds = testData.map(lambda p: (p[0],p[1].label, lrm.predict(p[1].features)))
#calculating the accuracy and printing it.
accuracy = labelsAndPreds.filter(lambda (i, v, p): v == p).count() / float(testData.count())
print("Accuracy = " + str(accuracy))

#initializing Streming context with a window of 10 secs
ssc = StreamingContext(sc, 10)
#fetching the input statement from S3
lines = ssc.textFileStream("s3://spark-sentimentanalysis/")
#calculating a wordcount
counts = lines.flatMap(lambda line: line.split(" "))\
             .map(lambda x: (x, 1))\
             .reduceByKey(lambda a, b: a+b)
Example no. 48


    # Evaluate the model on training data


    model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)




    # Build the model
    model3 = LogisticRegressionWithLBFGS.train(trainingData)



    model4 = GradientBoostedTrees.trainClassifier(trainingData,
        categoricalFeaturesInfo={}, numIterations=3)


    #model.setThreshold(0.07)
    model.clearThreshold()
        # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())

        j=0
    return LabeledPoint(float(int(hashlib.md5(datapoints[3]).hexdigest(), 16)/pow(10,38)), datapoints[1:3])

working_directory = os.getcwd()
working_directory = working_directory+"/"





configuartion=py.SparkConf()                                # setting the Spark Configuration
sContext=py.SparkContext(conf=configuartion)                # setting the Spark context
sContext.defaultParallelism
data = sContext.textFile(working_directory+"Test-TrainingData_SVM.csv")
testdata = sContext.textFile("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/KL/")

print testdata.take(1)

parsedData = data.map(parsePoint)
print parsedData.take(10)
# Build the LogisticRegressionWithLBFGS model
model = LogisticRegressionWithLBFGS.train(parsedData, iterations=10,numClasses=7)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# # Save and load model
# model.save(sc, "myModelPath")
# sameModel = SVMModel.load(sc, "myModelPath")
	
	#Cancelled becomes the 8th column now, and total columns in the data = 8
	label = clean_line_split[7]
	nonLable = clean_line_split[0:7]

	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit ([0.7, 0.3], seed=11L)
training.cache ()

#start timer at this point
startTime = datetime.now()
#build the model
model = LogisticRegressionWithLBFGS.train (training, numClasses=3)

#evaluate the model on training data
labelAndPreds = test.map (lambda x: (x.label, model.predict (x.features)))

#labelAndPreds = testData.map (lambda x: (x.label, model.predict (x.features)))
trainErr = labelAndPreds.filter (lambda (w, x): w != x).count () / float (test.count ())

print ('Time consumed = '), (datetime.now() - startTime)

print ("Training error = " + str (trainErr))

#save and load model
model.save(sc, "LRW-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRW-95-08")
sc.stop ()
Example no. 51
	#NB_socredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	#findCoveragePercent(NB_socredLabel, 0.4)
	SVMSGDmodel = SVMWithSGD.train(train_set)
	SVMSGDmodel.clearThreshold()
	SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
	SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))
	LRSGDmodel = LogisticRegressionWithSGD.train(train_set)	
	LRSGDmodel.clearThreshold()
	LRSGD_scoedLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.4))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 0.8))
	LRSGD_percent.append(findCoveragePercent(LRSGD_scoedLabel, 1.0))
	LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
	LRLBFGSmodel.clearThreshold()
	LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label)).sortByKey(ascending=False).map(lambda (k,v): v).collect())
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8))
	LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0))

def getAccumulatedPercentage(socredLabel):
	result = []
	total = socredLabel.sum()
	accum = 0
	for i in range(socredLabel.size):
		accum += socredLabel[i]
		result.append(accum/total)
	return result
SVM_accum = getAccumulatedPercentage(SVM_scoredLabel)
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BinaryClassificationMetricsExample")
    sqlContext = SQLContext(sc)
    # $example on$
    # Several of the methods available in scala are currently missing from pyspark
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11L)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)

    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
    # $example off$
Example no. 53
 def train(self,data,**kwargs):
     model = LogisticRegressionWithLBFGS.train(data=data,**kwargs)
     model.clearThreshold()
     self.model = model
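A hypothetical usage note for the wrapper above: because clearThreshold() is called, later predict() calls return raw scores (probabilities for logistic regression) rather than 0/1 labels, and any **kwargs are forwarded to LogisticRegressionWithLBFGS.train:

# Hypothetical usage: clf is an instance of the surrounding class.
clf.train(labeled_rdd, iterations=100, regParam=0.01)
model = clf.model
scores = labeled_rdd.map(lambda p: model.predict(p.features))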
Example no. 54
print irisTrainRDD.take(2)
print irisTestRDD.take(2)

# COMMAND ----------

# MAGIC %md
# MAGIC Now, we can use MLlib's logistic regression on our `RDD` of `LabeledPoints`.  Note that we'll use `LogisticRegressionWithLBFGS` as it tends to converge faster than `LogisticRegressionWithSGD`.

# COMMAND ----------

from pyspark.mllib.classification import LogisticRegressionWithLBFGS
help(LogisticRegressionWithLBFGS)

# COMMAND ----------

mllibModel = LogisticRegressionWithLBFGS.train(irisTrainRDD, iterations=1000, regParam=0.0)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's calculate our accuracy using `RDDs`.

# COMMAND ----------

rddPredictions = mllibModel.predict(irisTestRDD.values())
predictAndLabels = rddPredictions.zip(irisTestRDD.keys())

mllibAccuracy = predictAndLabels.map(lambda (p, l): p == l).mean()
print 'MLlib model accuracy: {0:.3f}'.format(mllibAccuracy)
    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)


            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)


            self.model = logistic_model

        except Exception as e:
            raise e