def Load_Regression_Model():
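  # Load three previously saved MLlib logistic regression models (assumes a SparkContext sc is in scope) and return them as a list.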
  model_1 = LogisticRegressionModel.load(sc, "./logistc_1.model")
  model_2 = LogisticRegressionModel.load(sc, "./logistc_2.model")
  model_3 = LogisticRegressionModel.load(sc, "./logistc_3.model")

  model_list = [model_1, model_2, model_3]
  return model_list
Example No. 2
 def Prediction(self, modelType):
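     # Load the saved model that matches modelType, score self.Features with it, and append the predictions to self.df_PD as a 'result' column.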
     data_point = self.Features
     if modelType == 'RF':
         model = RandomForestModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'GBDT':
         model = GradientBoostedTreesModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRsgd':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRlbfgs':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'SVM':
         model = SVMModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     else:
         pass
Example No. 3
def prediction(model_directory, libsvm_file, outputfile):
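    # Score every record of a LibSVM file with a saved logistic regression model and write the raw scores, one per line, to outputfile.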
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    model = LogisticRegressionModel.load(sc, model_directory)
    #print "numfeature",model.numFeatures
    #print "aaaaaaaa"
    vectors = MLUtils.loadLibSVMFile(sc,
                                     libsvm_file,
                                     numFeatures=model.numFeatures)
    vectors.cache()
    model.clearThreshold()
    # vector = vectors.collect()
    # for v in vector:
    #
    #     features = v.features
    #     print features
    #     print "bbbb",len(features),model.predict(Vectors.dense(features))
    # exit()
    scores = vectors.map(lambda p: (model.predict(Vectors.dense(p.features))))
    #   lambda p: (p.label, model.predict(p.features)))
    scores_list = scores.collect()
    file_out_obj = open(outputfile, 'w')
    for score in scores_list:
        #print '----->',score
        file_out_obj.write(str(score) + '\n')
    file_out_obj.close()
Example No. 4
def model_instream(sc, **params):
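    # Check that the model path exists on HDFS, then load a saved model of the requested type from the user's model directory.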
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    if not fs.exists(
            sc._jvm.org.apache.hadoop.fs.Path(HDFS_PATH + str(g_cache.user) +
                                              '/model/' + params['path'])):
        raise Exception("Invalid file path, path not exists!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(
            sc, HDFS_PATH + str(g_cache.user) + '/model/' + params['path'])
    else:
        raise Exception("Invalid model type!")
    return True, model
    def load_parameters(self):
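        # Restore the saved prediction settings, then load the amount and trend models with the loader that matches each configured method.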
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model
Example No. 6
def main():

    parser = argparse.ArgumentParser(description='Park or Bird Prediction Engine')
    parser.add_argument('--i','--input', type=str, required=True, default=None, help='Input file or directory of jpg images')
    parser.add_argument('--m','--method', type=str, required=True, default=None, help='Model method, 1 or 2')
    args = parser.parse_args()

    outfile = '/gpfs/gpfsfpo/prediction/predict_me.txt.gz'
    os.system('rm -f ' + outfile)

    sc = SparkContext(appName="Park Bird Predction Model 1")

    args.m = int(args.m) if args.m in ['1', '2'] else 2
    model_path = '/gpfs/gpfsfpo/shared/model_1_LBFGS' if args.m == 1 else '/gpfs/gpfsfpo/shared/model_2'

    CreateTestData(args.i, args.m, outfile)

    raw_input = sc.textFile(outfile)
    k = raw_input.map(lambda x: x.split(',')[0])
    p = raw_input.map(lambda x: x.split(',')[1]).map(lambda x: x.split(' ')).map(lambda x: [float(y) for y in x]).map(lambda x: Vectors.dense(x))

    model = LogisticRegressionModel.load(sc, model_path)
    predictions = model.predict(p)
    keyPredictions = k.zip(predictions.map(lambda x: "IT'S A BIRD!" if x==1 else "IT'S A PARK!"))

    print("************* RESULTS *******************")
    print(keyPredictions.collect())

    sc.stop()
Example No. 7
 def __init__(self, trained_model, *args, **kwargs):
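     # Initialize the streaming estimator, then seed its internal model with the weights of the already-trained model.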
     super(MyStreamingLogisticRegressionWithSGD,
           self).__init__(*args, **kwargs)
     self.trained_model = trained_model
     self._model = LogisticRegressionModel(
         weights=self.trained_model.weights,
         intercept=self.trained_model.intercept,
         numFeatures=self.trained_model.numFeatures,
         numClasses=self.trained_model.numClasses,
     )
Example No. 8
 def __init__(self, path):
     conf = SparkConf() \
         .setAppName("crankshaw-pyspark") \
         .set("spark.executor.memory", "2g") \
         .set("spark.kryoserializer.buffer.mb", "128") \
         .set("master", "local")
     sc = SparkContext(conf=conf, batchSize=10)
     self.model = LogisticRegressionModel.load(sc, path)
     self.path = path
     print("started spark")
def main(sc):
    data = [
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(1.0, [1.0, 0.0])
        ]
    lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
    print (lrm.predict([1.0, 0.0]))
    print(lrm.predict([0.0, 1.0]))
    # Save and load model
    lrm.save(sc, "lrsgd")
    sameModel = LogisticRegressionModel.load(sc, "lrsgd")
    print(sameModel.predict([1.0, 0.0]))
    print(sameModel.predict([0.0, 1.0]))
Example No. 10
def check(test_data: OneDigit, logistic_regression_model: LogisticRegressionModel):
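    # Count how many test samples the logistic regression model predicts correctly and print the raw counts and accuracy.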
    num = len(test_data.X01)
    right_num = 0
    i = 0
    for x in test_data.X01:
        p_y = logistic_regression_model.predict(x)
        y = int(test_data.y01[i])
        i += 1
        if p_y == y:
            right_num += 1
    print(right_num)
    print(num)
    print(right_num / num)
Example No. 11
    def __init__(self, path):

        conf = SparkConf() \
            .setAppName("crankshaw-pyspark") \
            .set("spark.executor.memory", "2g") \
            .set("spark.kryoserializer.buffer.mb", "128") \
            .set("master", "local")
        sc = SparkContext(conf=conf, batchSize=10)
        self.model = LogisticRegressionModel.load(sc, path)
        # self.model = RandomForestModel.load(sc, path)
        self.path = path
        # path = '/Users/crankshaw/model-serving/tugboat/feature_servers/python/spark_model'
        # self.name, self.model = load_pyspark_model(path)

        print("started spark")
Example No. 12
def testing_model(model_directory, libsvm, prediction, report, prc_file):
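    # Evaluate a saved logistic regression model on a LibSVM test set, then write the report, the predictions, and the precision-recall curve.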
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)
    testing_rdd = MLUtils.loadLibSVMFile(sc, libsvm,
                                         numFeatures=model.numFeatures)
    testing_rdd.cache()
    au_prc, precision, recall, thresholds, y_true, y_scores = evaluate_model(
        testing_rdd, model)
    print('evaluating_model done!\n')
    write_to_report(au_prc, precision, recall, thresholds, report)
    print('write_to_report done!\n')
    write_to_prediction(y_true, y_scores, prediction)
    print('write_to_prediction done!\n')
    draw_prc(precision, recall, prc_file, au_prc)
    print('draw_prc done!\n')
Example No. 13
def main():
    #spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print "Session created!"
    sc = spark.sparkContext
    print "The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId

    print sys.argv
    sampleHDFS_1 = sys.argv[1]
    sampleHDFS_2 = sys.argv[2]
    outputHDFS = sys.argv[3]

    sampleRDD = sc.textFile(sampleHDFS_1).map(parse)
    predictRDD = sc.textFile(sampleHDFS_2).map(lambda x: parse(x, True))

    # Train the model
    model = LogisticRegressionWithLBFGS.train(sampleRDD)
    model.clearThreshold()  # Clear the default threshold (otherwise only 0/1 labels are output)

    # Predict and save the results
    labelsAndPreds = predictRDD.map(
        lambda p: (p[0], p[1].label, model.predict(p[1].features)))
    labelsAndPreds.map(lambda p: '\t'.join(map(str, p))).saveAsTextFile(
        outputHDFS + "/target/output")

    # Evaluate precision and recall at different thresholds
    labelsAndPreds_label_1 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 1)
    labelsAndPreds_label_0 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 0)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print("thre\ttp\tfn\tfp\ttn\tprecision\trecall")
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = labelsAndPreds_label_1.filter(lambda lp: lp[2] > thre).count()
        fn = t_cnt - tp
        fp = labelsAndPreds_label_0.filter(lambda lp: lp[2] > thre).count()
        tn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f" %
              (thre, tp, fn, fp, tn, float(tp) / (tp + fp), float(tp) /
               (t_cnt)))

    # Save the model, then load it back
    model.save(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")

    print("output:", outputHDFS)
def create_or_load_model(sc: SparkContext,
                         train_dataset_path: str) -> LogisticRegressionModel:
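    # Train, save, and return a new LBFGS logistic regression model when nothing exists at MODEL_PATH; otherwise load and return the saved model.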
    if not os.path.exists(MODEL_PATH):
        print('training model...')
        dataset_rdd = sc.textFile(train_dataset_path)
        table_rdd = dataset_rdd.map(lambda line: line.split(','))
        labeled_features = rdd_to_feature(table_rdd)
        # labeled_features.foreach(lambda lp: print(lp))
        labeled_features.cache()
        model = LogisticRegressionWithLBFGS.train(labeled_features,
                                                  numClasses=NUM_CLASSES)
        model.setThreshold(0.5)
        model.save(sc, MODEL_PATH)
        return model
    else:
        model = LogisticRegressionModel.load(sc, MODEL_PATH)
        return model
    def do_stuff(self, parameters):
        # Copy the first value of the parameters map (a JVM list) into a Python list.
        val = parameters.values()
        jvm_list = val.head()
        size = jvm_list.size()
        pylist = []
        count = 0
        while count < size:
            pylist.append(jvm_list.head())
            count = count + 1
            jvm_list = jvm_list.tail()

        heat = pylist[0]
        km = pylist[1]
        lrm = LogisticRegressionModel.load(self.context, "/tmp/brakeModel")
        worn = lrm.predict([km, heat])
        return ("brake is worn=", worn)
Example No. 16
def main():

    parser = argparse.ArgumentParser(
        description='Park or Bird Prediction Engine')
    parser.add_argument('--i',
                        '--input',
                        type=str,
                        required=True,
                        default=None,
                        help='Input file or directory of jpg images')
    parser.add_argument('--m',
                        '--method',
                        type=str,
                        required=True,
                        default=None,
                        help='Model method, 1 or 2')
    args = parser.parse_args()

    outfile = '/gpfs/gpfsfpo/prediction/predict_me.txt.gz'
    os.system('rm -f ' + outfile)

    sc = SparkContext(appName="Park Bird Predction Model 1")

    args.m = int(args.m) if args.m in ['1', '2'] else 2
    model_path = '/gpfs/gpfsfpo/shared/model_1_LBFGS' if args.m == 1 else '/gpfs/gpfsfpo/shared/model_2'

    CreateTestData(args.i, args.m, outfile)

    raw_input = sc.textFile(outfile)
    k = raw_input.map(lambda x: x.split(',')[0])
    p = raw_input.map(lambda x: x.split(',')[1]).map(
        lambda x: x.split(' ')).map(lambda x: [float(y) for y in x]).map(
            lambda x: Vectors.dense(x))

    model = LogisticRegressionModel.load(sc, model_path)
    predictions = model.predict(p)
    keyPredictions = k.zip(
        predictions.map(lambda x: "IT'S A BIRD!"
                        if x == 1 else "IT'S A PARK!"))

    print("************* RESULTS *******************")
    print(keyPredictions.collect())

    sc.stop()
Example No. 17
def main(sc):
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("logistic_regression_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
        parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "myModelPath")
    sameModel = LogisticRegressionModel.load(sc, "myModelPath")
Example No. 18
    encoding='utf8',
    header=True,
    inferSchema=True)
test_data = test_data.rdd

# %%
# Convert the test-set features into vectors
test = test_data.map(lambda line:
                     (line[0], line[1], line[2], Vectors.dense(line[3:])))

# %% [markdown]
# ## Logistic Regression

# %%
from pyspark.mllib.classification import LogisticRegressionModel
lr_model = LogisticRegressionModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/LogisticRegressionModel")

# %%
lr_predictions = test.map(lambda line:
                          (line[0], line[1], float(lr_model.predict(line[3]))))
lr_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:08:52  Rank: none
# score:0.5015744
# %% [markdown]
# ## SVM

# %%
from pyspark.mllib.classification import SVMModel
Example No. 19
from pyspark.mllib.linalg import Vectors

# create the spark-context
sc = SparkContext('local', 'pyspark')

# load and parse the data
data = sc.textFile("hdfs:///user/events/test_data_spark/")
splits = data.map(lambda line: line.split(',')).filter(lambda x: x[5] != '\\N')

# the user-event label
user_event = splits.map(lambda fields: (fields[1], fields[2]))
# extract the features
features = splits.map(lambda fields: Vectors.dense(fields[3:]))

# load the model
sameModel = LogisticRegressionModel.load(sc, "hdfs:///user/events/model/LR")
# predict the users-interest
predictions = sameModel.predict(features.map(lambda p: (p[0:])))

# re-organize the label-prediction
label_prediction = user_event.zip(predictions).map(
    lambda x: (int(x[0][0]), int(x[0][1]), x[1]))

# create the header
header = sc.parallelize(["user,event,interested"])
# wrap the label-prediction
lines = label_prediction.map(lambda v:
                             (str(v[0]) + "," + str(v[1]) + "," + str(v[2])))
# save the result into HDFS
(header +
 lines).repartition(1).saveAsTextFile("hdfs:///user/events/predictions")
        from pyspark.mllib.linalg import Vectors 
        
        SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
        SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
        SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
        SparkContext.setSystemProperty('spark.cores.max', args.core_max)

        sc = SparkContext(args.sp_master, 'single_predict:'+str(args.row_id))
        flag_model = ml_opts['learning_algorithm']        
        save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str

        if flag_model == "linear_svm_with_sgd":
            mllib_model = SVMModel.load(sc, save_dir)
            col_num = len(mllib_model.weights)
        elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd":
            mllib_model = LogisticRegressionModel.load(sc, save_dir)
            col_num = mllib_model.numFeatures # len(mllib_model.weights) return 3x value
        elif flag_model == "kmeans":
            mllib_model = KMeansModel.load(sc, save_dir)
            col_num =len(mllib_model.clusterCenters[0])
        else:
            print "ERROR: Training model selection error: no valid ML model selected!"
            return
        # get the model dimension
        #col_num = len(mllib_model.weights)
        print "INFO: total feature # in mllib model=",col_num

        # calculate hypothesis value ================
        model_weight=None
        if learning_algorithm not in ("kmeans",):
            model_weight=mllib_model.weights
Example No. 21
data = sc.textFile("sample_svm_data.txt")
# Note: use LabeledPoint here to build the feature column
parsedData = data.map(parsePoint)

# Optimize with LBFGS
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(
    parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "pythonLogisticRegressionWithLBFGSModel")
sameModel = LogisticRegressionModel.load(
    sc, "pythonLogisticRegressionWithLBFGSModel")

###################################################################
# Random forest
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
(trainingData, testData) = data.randomSplit([0.7, 0.3])

#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
# Maximum number of bins used for splitting features (the number of split intervals).
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
Example No. 22
    # get the command-line arguments
    args = get_cli_args()

    # create a label encoder from a local json file that contains the set
    l_encoder = label_encoders_from_json_file(args.labels, args.category)

    # ---------------  Choose the Operation to perform =-------------- #
    if args.operation.lower() == "train":
        # get/create spark context
        sc = get_spark_context("Train/Update LR Model")

        # load initial weights if it's an update operation
        init_model = None
        if args.update:
            print("---> Loading model")
            init_model = LogisticRegressionModel.load(sc, args.model)
            print("---> OK")

        # do the train job
        model = perform_train_job(sc, args.input, l_encoder,
                                          initial_model=init_model,
                                          evaluate=args.evaluate,
                                          category=args.category)
        # save the model weights as a csv file
        try:
            system("hdfs dfs -rm -r " + args.model)
        except Exception:
            print("Failed to delete model: ", args.model)
        print("---> Saving LR model")
        model.save(sc, args.model)
        print("---> OK")
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(sc,
                                             "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    # $example off$
Example No. 24
    # get the command-line arguments
    args = get_cli_args()

    # create a label encoder from a local json file that contains the set
    l_encoder = label_encoders_from_json_file(args.labels, args.category)

    # ---------------  Choose the Operation to perform =-------------- #
    if args.operation.lower() == "train":
        # get/create spark context
        sc = get_spark_context("Train/Update LR Model")

        # load initial weights if it's an update operation
        init_model = None
        if args.update:
            print("---> Loading model")
            init_model = LogisticRegressionModel.load(sc, args.model)
            print("---> OK")

        # do the train job
        model = perform_train_job(sc, args.input, l_encoder,
                                  initial_model=init_model,
                                  evaluate=args.evaluate,
                                  category=args.category)
        # save the model weights as a csv file
        try:
            system("hdfs dfs -rm -r " + args.model)
        except Exception:
            print("Failed to delete model: ", args.model)
        print("---> Saving LR model")
        model.save(sc, args.model)
        print("---> OK")
Example No. 25
def _load_pre_trained_model():
    '''Load the trained LogisticRegressionModel.'''
    trained_model = LogisticRegressionModel.load(sc, "model/SGD")
    # trained_model = LogisticRegressionModel.load(sc, "model/LBFGS")
    return trained_model
Example No. 26
    def train_model (conf):
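        # Train an LBFGS logistic regression model on labeled distance features, save and reload it, then print classification and regression metrics on a held-out split.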
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count () / float (test.count())
        print("Error on the test split => {0}".format (trainErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda lp: lp[0] != lp[1]).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))
Example No. 27
training.cache ()

#start timer at this point
startTime = datetime.now()
#build the model
model = LogisticRegressionWithLBFGS.train (training, numClasses=3)

#evaluate the model on training data
labelAndPreds = test.map (lambda x: (x.label, model.predict (x.features)))

#labelAndPreds = testData.map (lambda x: (x.label, model.predict (x.features)))
trainErr = labelAndPreds.filter (lambda wx: wx[0] != wx[1]).count () / float (test.count ())

print ('Time consumed = ' + str(datetime.now() - startTime))

print ("Training error = " + str (trainErr))

#save and load model
model.save(sc, "LRW-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRW-95-08")
sc.stop ()
"""metrics = MulticlassMetrics(labelAndPreds)
# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)"""
Example No. 28
    ]

    #Cancelled becomes the 5th column now, and total columns in the data = 5
    label = clean_line_split[4]
    nonLabel = clean_line_split[0:4]

    return LabeledPoint(label, nonLabel)


parsedData = raw_data.map(parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3], seed=11)
#start timer at this point
startTime = datetime.now()
#build the model"""
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)
training.cache()

#evaluate the model on training data
labelAndPreds = test.map(lambda x: (x.label, model.predict(x.features)))
trainErr = labelAndPreds.filter(lambda wx: wx[0] != wx[1]).count() / float(
    test.count())
print('Time consumed = ' + str(datetime.now() - startTime))

print("Training error = " + str(trainErr))

#save and load model
model.save(sc, "LRN-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRN-95-08")
sc.stop()
	"""
	symbolic_indexes = [5, 7, 12, 18, 21]
	clean_line_split = [item for i, item in enumerate (line_split) if i in symbolic_indexes]
	
	#Cancelled becomes the 5th column now, and total columns in the data = 5
	label = clean_line_split[4]
	nonLabel = clean_line_split[0:4]

	return LabeledPoint (label, nonLabel)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit ([0.7, 0.3], seed=11)
#start timer at this point
startTime = datetime.now()
#build the model"""
model = LogisticRegressionWithLBFGS.train (training, numClasses=3)
training.cache ()

#evaluate the model on training data
labelAndPreds = test.map (lambda x: (x.label, model.predict (x.features)))
trainErr = labelAndPreds.filter (lambda wx: wx[0] != wx[1]).count () / float (test.count ())
print ('Time consumed = ' + str(datetime.now() - startTime))

print ("Training error = " + str (trainErr))

#save and load model
model.save (sc, "LRN-2008")
sameModel = LogisticRegressionModel.load (sc, "LRN-2008")
sc.stop ()
Example No. 30
#geolocator = Nominatim()

sparkConf = SparkConf().setMaster("local").setAppName("Predict").set("spark.app.id", "Predict")
sc = SparkContext(conf=sparkConf)
inp = sc.textFile("testing.txt").map(lambda row: row.split(" "))
word2vec = Word2Vec()
model = word2vec.fit(inp)
WordVectors = {}

for i in model.getVectors().keys():
    WordVectors[i] = model.findSynonyms(i,7)

Positive = open(os.getcwd() + "/positive.txt").read().splitlines()
Negative = open(os.getcwd() + "/negative.txt").read().splitlines()

sameModel = LogisticRegressionModel.load(sc, "model")

from pymongo import MongoClient
client = MongoClient('localhost:27017')
db=client.test
tweetList = []
tweets=db.tweetdb.find()
for tweet in tweets:
	l=[]
	entity_names = ""
	loc = ""
	if tweet.get('place'):
		loc=str(tweet['place']['full_name']).split(',')[-1]
		print(loc)
	text = tweet['text']	
	
Example No. 31
# In[7]:

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
labelsAndPreds.sample(False, 0.03).collect()

# In[8]:

trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
    parsedData.count())
print("Training Error = " + str(trainErr))

# In[10]:

model.save(sc, "model_ex2.mod")
sameModel = LogisticRegressionModel.load(sc, "model_ex2.mod")

# In[11]:

from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

feats = parsedData.map(lambda p: p.features)
Mdata = RowMatrix(parsedData.map(lambda p: p.features))

# In[12]:

feats.take(4)

# In[13]:
Example No. 32
           .map(lambda line : line.split(',')) \
           .filter(lambda fields: fields[0] in testdays and \
                                  fields[22] != '')

# these are the fields we used in the regression
# format is LabeledPoint(label, [x1, x2, ...]) 
flights = allfields.map(lambda fields: LabeledPoint(\
              float(float(fields[22]) < 15), #ontime \
              [ \
                  float(fields[15]), # DEP_DELAY \
                  float(fields[16]), # TAXI_OUT \
                  float(fields[26]), # DISTANCE \
              ]))

# read the saved model
lrmodel = LogisticRegressionModel.load(sc,\
             'gs://cloud-training-demos/flights/sparkoutput/model')
print(lrmodel.weights, lrmodel.intercept)

# how good is the fit?
lrmodel.setThreshold(0.7) # cancel if prob-of-ontime < 0.7
labelpred = flights.map(lambda p: (p.label, lrmodel.predict(p.features)))
def eval(labelpred):
    cancel = labelpred.filter(lambda lp: lp[1] == 1)
    nocancel = labelpred.filter(lambda lp: lp[1] == 0)
    corr_cancel = cancel.filter(lambda lp: lp[0] == lp[1]).count()
    corr_nocancel = nocancel.filter(lambda lp: lp[0] == lp[1]).count()
    return {'total_cancel': cancel.count(), \
            'correct_cancel': float(corr_cancel)/cancel.count(), \
            'total_noncancel': nocancel.count(), \
            'correct_noncancel': float(corr_nocancel)/nocancel.count() \
           }
Example No. 33
getID = UserDefinedFunction(lambda x: parse_tweet(x)[0], StringType())
getTs = UserDefinedFunction(lambda x: parse_tweet(x)[1], StringType())
getTweet = UserDefinedFunction(lambda x: parse_tweet(x)[2], StringType())

# Apply the UDF using withColumn
tweets = (tweets.withColumn('id', getID(col("data"))).withColumn(
    'ts', getTs(col("data"))).withColumn('Tweet', getTweet(
        col("data")))).toPandas()  # tweets is now a pandas df

# convert tweets pandas df into input tensor for logistic regression model

# df = pd.read_csv()
input_tensor = create_input_tensor(tweets)

# load MlLib model
sameModel = LogisticRegressionModel.load(
    sc, "hdfs:///user/project/llib_logistic.model")
# retrieve sentiments from input tensor using model
tweet_f = input_tensor.to_numpy()
pred = sameModel.predict(tweet_f)
# create DF to send to dashboard

dashboard_df = pd.DataFrame()
dashboard_df['tweet'] = tweets['Tweet']
dashboard_df['ts'] = tweets['ts']
dashboard_df['prediction'] = pred
print(pred)
# send created DF to dashboard
# send_df_to_dashboard(df)

#button action = 'exec(python capture.py' + userinput+')' runs this in EMR instance,
from geopy.geocoders import Nominatim
import json
from Levenstein import Lev
stop_words = nltk.corpus.stopwords.words('english')
stop_words+=['?','.','!',',']
geolocator = Nominatim()
sparkConf = SparkConf().setMaster("local").setAppName("PredictKafkaTweetStreaming").set("spark.app.id", "Predict")
sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")

with open('WordVectors.json') as data_file:    
    WordVectors= json.load(data_file)
Positive = open(os.getcwd() + "/positive.txt").read().splitlines()
Negative = open(os.getcwd() + "/negative.txt").read().splitlines()

sameModel = LogisticRegressionModel.load(sc, "model")

def MakeTuple(l):
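	# Parse one raw tweet JSON string and pull out its coordinates and text.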
	tweet=json.loads(l)
	loc = ""
	coords=""
	#print tweet
	if 'coordinates' in tweet and  not (tweet["coordinates"]  is None):
		#locAll=str(tweet["place"]["full_name"])
		#loc=locAll.split(',')[-1]
		#location= geolocator.geocode(str(tweet['place']['full_name']))
		#coords=str(str(location.longitude) + ',' + str(location.latitude))
		coords=tweet["coordinates"]
	try:
		text = tweet["text"]	
	except KeyError:
Example No. 35
from pyspark import SparkConf

if __name__ == '__main__':

    print("This is the name of the script: ", sys.argv[0])
    print("Number of arguments: ", len(sys.argv))
    print("The arguments are: ", str(sys.argv))

    queryInputPath = sys.argv[1]
    savedModelPath = sys.argv[2]

    conf = SparkConf()
    conf.setAppName("SpamDetection")
    sc = SparkContext.getOrCreate(conf=conf)

    model = LogisticRegressionModel.load(sc, savedModelPath)

    query = sc.textFile(queryInputPath, use_unicode=False)

    tf = HashingTF(numFeatures=1000)

    def classify(data):
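        # Tokenize one message, hash it into a 1000-feature vector, and classify it with the loaded model.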
        data2 = data.split()
        datatf = tf.transform(data2)
        classifications = model.predict(datatf)

        return classifications

    classifications = query.map(lambda x: (classify(x), x))

    predictions = classifications.collect()
Example No. 36
# Save and load model
#model.save(sc, "myModelPath")
#sameModel = SVMModel.load(sc, "myModelPath")


# In[ ]:

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath2")
sameModel = LogisticRegressionModel.load(sc, "myModelPath2")

Example No. 37
steam_key = "86FE36CEEF0FECD245B5C711C8B82C5A"
CONV64_32 = 76561197960265728
SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
profile_map = {}
famous_map = {}
dota_appid = 570
cur_id32 = -1

model_map = {}
tag_name_map = defaultdict(dict)

sc = SparkContext('local')
for i in range(1, 11):
    model_name = str(i) + "_players_model.model"
    print "load " + model_name
    model = LogisticRegressionModel.load(sc, "models/" + model_name)
    model_map[i] = model

tag_name_map[1][0] = "newbie"
tag_name_map[1][1] = "normal"
tag_name_map[1][2] = "legend"
tag_name_map[1][3] = "divine"

tag_name_map[2][0] = "onlooker"
tag_name_map[2][1] = "effective assistant"
tag_name_map[2][2] = "warrior"
tag_name_map[2][3] = "main force"
tag_name_map[2][4] = "tower killer"

tag_name_map[3][0] = "pioneer"
tag_name_map[3][1] = "enemy controller"
Example No. 38
# Initialize features to <number of sensors>-length array, filled with neutral initial sensor value
features = np.zeros(n_sensors)
features.fill(0.5)

# Initialize streaming for specified reporting interval
sc = SparkContext(appName="iotstream_lr_kafka")
interval = sc.accumulator(0)
empty_intervals = sc.accumulator(0)
events = sc.accumulator(0)
ssc = StreamingContext(sc, reporting_interval)
sensor_stream = KafkaUtils.createDirectStream(
    ssc, [topic], {"bootstrap.servers": kafka_server_list})

# Load pre-computed model
model = LogisticRegressionModel.load(sc, modelname)

# Run model on each batch
#sensor_stream.pprint(10)
sensor_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
start_time = time()
ssc.awaitTermination()
finish_time = time()
elapsed_time = finish_time - start_time - empty_intervals.value * reporting_interval - 1.5  # Subtract off time waiting for events and 1.5 sec for termination
print(
    '\n%s.%03dZ: %d events received in %.1f seconds (%d intervals), or %.0f sensor events/second\n'
    % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()),
       (time() * 1000) % 1000, events.value - 1, elapsed_time, interval.value,
Example No. 39
conf = SparkConf().setAppName("TFIDF").set("spark.executor.memory", "2g") 
sc = SparkContext(conf=conf)

place = "/Users/daniellenash/Downloads/goodValidation/"
placeAdd = ["goodValidation1.txt","goodValidation2.txt",
"goodValidation3.txt","goodValidation4.txt","goodValidation5.txt","goodValidation6.txt",
"goodValidation7.txt","goodValidation8.txt","goodValidation9.txt","goodValidation10.txt"]

place2 = "/Users/daniellenash/Downloads/badValidation/"
placeAdd2 = ["badValidation1.txt","badValidation2.txt",
"badValidation3.txt","badValidation4.txt","badValidation5.txt","badValidation6.txt",
"badValidation7.txt","badValidation8.txt","badValidation9.txt","badValidation10.txt"]

hashingTF = HashingTF(100000)
model = LogisticRegressionModel.load(sc, "/Users/daniellenash/Downloads/spark-1.6.1-bin-hadoop2.6/python/LRModel")


for c in range(0,10):
	currentPlace = place +""+ placeAdd[c]
	
	documents = sc.textFile(currentPlace).filter(lambda x : len(x) > 15)
	docTokens = documents.map(lambda x: x.split(" "))
	
	tf = hashingTF.transform(docTokens)
	idf = IDF(minDocFreq=5).fit(tf)
	tfidf = idf.transform(tf)
	
	val = model.predict(tfidf)

	mapped = val.map(lambda x: (x,1))
Example No. 40
from csv import reader  # assumed: reader() used below is csv.reader
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def parsePoint(line):
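    # Use the last column as a 0/1 label (anything other than 1 becomes 0) and the remaining columns as features.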
    values = [float(x) for x in line]
    return LabeledPoint(values[-1] if values[-1]==1 else 0, values[:-1])

data = sc.textFile('mergedAB_delete_all_empty.csv')
data = data.mapPartitions(lambda x: reader(x))
#header = data.first()
#data = data.filter(lambda x: x != header)
data = data.filter(lambda x: x[-1] in ['1', '-1'])

parsedData = data.map(parsePoint)


# Build the model
model_lr = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds_lr = parsedData.map(lambda p: (p.label, model_lr.predict(p.features)))
trainErr = labelsAndPreds_lr.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath")
sameModel = LogisticRegressionModel.load(sc, "myModelPath")
Example No. 41
from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 10)

kafka_configuration_params = {
    "topic": ["BigData"],
    "connectionstring": "localhost:9092"
}

from pyspark.streaming.kafka import KafkaUtils
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, kafka_configuration_params["topic"],
    {"metadata.broker.list": kafka_configuration_params["connectionstring"]})

from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel

LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model")
SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model")
NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model")

import nltk
import random
from nltk.tokenize import word_tokenize

allowed_word_types = ["JJ"]

rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000")
rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect())


def convert_tweet_to_instance(tweets):
def loadLogisticRegressionSparkModel(sc, modelFileName):
    print_(nowStr() + ':', 'loading', modelFileName + '...')
    modelFileName = joinPath(sparkFolder, modelFileName)
    return LogisticRegressionModel.load(sc, modelFileName)
Example No. 43
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
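    # Drop the first and last characters of the line, parse the comma-separated values, and binarize column 34 at 0.5 as the label.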
    data = line[1:][:-1]
    values = [float(x) for x in data.split(', ')]
    return LabeledPoint(1 if values[34] > 0.5 else 0, values[:-1])

conf = SparkConf() \
      .setAppName(sys.argv[0])\
      .set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
data = sc.textFile(sys.argv[1])
parsedData = data.map(parsePoint)

model = LogisticRegressionModel.load(sc, "Model_logistc")

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Test Error = " + str(trainErr))
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(sc,
                                             "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    # $example off$
    sc.stop()
 def load(self, location):
     try:
         self.model = LogisticRegressionModel.load(self.sc, location)
     except Exception as e:
         raise e