Example 1
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
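
A hedged note on the snippet above: ChiSqSelector requires a numeric label column, so if "Position" is a string in the source data (the schema resembles the FIFA players dataset), it would have to be indexed first. A minimal sketch under that assumption:

from pyspark.ml.feature import StringIndexer

# Assumed pre-step, only needed if "Position" is a string column: index it to numbers,
# then point the ChiSqSelector's labelCol at "PositionIndex" instead of "Position".
df = StringIndexer(inputCol="Position", outputCol="PositionIndex").fit(df).transform(df)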
Example 2
def preprocessed_df(df, label="flg_cmd_lowcostIndex"):
    max_values_to_define_str_cols = 10
    id_col = 'ID_CLIENT'

    dty = dict(df.dtypes)
    str_cols = [k for k, v in dty.items() if v == 'string']
    str_cols.remove(id_col)

    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
        model_str = stringIndexer.fit(df)
        df = model_str.transform(df).drop(c)

    input_cols = df.columns
    input_cols.remove(id_col)
    input_cols.remove(label)

    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    df = assembler.transform(df)

    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=max_values_to_define_str_cols).fit(df)
    return featureIndexer.transform(df), df
Example 3
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC", "Creatinina",
        "Factor_correccion", "Proteinuria", "Farmacos_Antihipertensivos",
        "Estatina", "Antidiabeticos", "Adherencia_tratamiento"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)

    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
Example 4
 def test_model_vector_indexer_single(self):
     vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
     data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                        (Vectors.dense([0.0]), ),
                                        (Vectors.dense([0.0]), )], ["a"])
     model = vi.fit(data)
     model_onnx = convert_sparkml(
         model,
         'Sparkml VectorIndexer Single',
         [('a', FloatTensorType([None, model.numFeatures]))],
         target_opset=9)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.toPandas().indexed.apply(
         lambda x: pandas.Series(x.toArray())).values
     data_np = data.toPandas().a.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlVectorIndexerSingle")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['indexed'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example 5
def training_data_prepare(spark, filename):
    # Use this function when processing only a small amount of data, for better speed
    header, rdd = fw.read_data(spark, filename)
    # Initial filtering of the header row
    header = list(header.split(','))
    header.pop(0)

    # # Use this function instead when running concurrently
    # header, rdd = fw.read_list_data(spark, filename)
    # Print the record count, max, min, mean and standard deviation, and compute Pearson correlations
    personal_array = data_description(spark, rdd.cache(), header)
    # Print the Pearson correlation coefficients
    personal_show(personal_array, header)
    # Rescale columns whose values are on an overly large scale
    rdd = annual_premium_scaler(spark, rdd, header)
    # Merge features with strong linear correlation
    rdd, header = damaged_couple(rdd, header)
    # Discretize the age feature into categories
    rdd = arrange_for_age(rdd, header)
    # Second-pass cleaning of the data using a chi-square test
    cleaned_rdd, cleaned_header = useful_select(spark, rdd, header)

    def map_fuc_rdd(row):
        ret = []
        for i in range(len(row) - 1):
            ret.append(row[i])

        return ret, row[-1]

    cleaned_rdd = cleaned_rdd.map(map_fuc_rdd)

    # Set the target dimensionality for PCA reduction
    # n = 3
    # pp.PCA_builder(spark,rdd,n)
    # Convert the data into the format required by the ML algorithms
    def map_fuc(row):
        features_array = np.array(row[0])
        index_array = np.arange(features_array.size)
        num = features_array.size

        return row[1], Vectors.sparse(num, index_array, features_array)

    labeled_points_rdd = cleaned_rdd.map(map_fuc)
    # print(labeled_points_rdd.first())
    data = spark.createDataFrame(labeled_points_rdd,
                                 schema=['label', 'indexedFeatures'])
    # Further process the data with the VectorIndexer feature transformer
    data = VectorIndexer(inputCol="indexedFeatures",
                         outputCol="features",
                         maxCategories=4).fit(data).transform(data)
    # Drop the column that is no longer needed
    data = data.drop("indexedFeatures")
    data = balanceDataset(data)
    # Split the data into training and test sets
    training_data, check_data = data.randomSplit([0.7, 0.3])
    # Oversample the minority class
    training_data = enlarge_data(training_data, 0.15)
    return training_data.cache(), check_data.cache(), cleaned_header
Example 6
    def chiSquareTest(self, categoricalFeatures, maxCategories):
        dataset = self.dataset
        labelColm = self.labelColm
        features = self.features
        length = len(features)

        featureassembler = VectorAssembler(
            inputCols=self.features,
            outputCol="featuresChiSquare", handleInvalid="skip")
        dataset = featureassembler.transform(dataset)

        vec_indexer = VectorIndexer(inputCol="featuresChiSquare", outputCol='vecIndexedFeaturesChiSqaure', maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))

        dataset = vec_indexer.transform(dataset)

        # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
        # finalized_data.show()

        # using chi selector
        selector = ChiSqSelector(numTopFeatures=length, featuresCol="vecIndexedFeaturesChiSqaure",
                                 outputCol="selectedFeatures",
                                 labelCol=labelColm)

        result = selector.fit(dataset).transform(dataset)

        print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
        result.show()

        # running the chi-square value test

        r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
        p_values = list(r.pValues)
        PValues = []
        for val in p_values:
            PValues.append(round(val, 4))
        print(PValues)
        dof = list(r.degreesOfFreedom)
        stats = list(r.statistics)
        statistics = []
        for val in stats:
            statistics.append(round(val, 4))
        print(statistics)
        chiSquareDict = {}
        for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
            print(pval, doF, stat)
            chiSquareDict[colm] = pval, doF, stat
        chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
        print(chiSquareDict)

        result = {'pvalues': chiSquareDict}

        return result
Example 7
    def train_test(self, df):
        
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)

        labelIndexer.transform(data)

        featureIndexer = VectorIndexer(inputCol="features", \
                                        outputCol="indexedFeatures").fit(data)
        featureIndexer.transform(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
     #   trainingData.repartition(200)
        testData.cache()
       # testData.repartition(200)
        trainingData.show(5,False)
        testData.show(5,False)

        rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("features","label","predictedLabel", "prediction")

        # Select (prediction, true label) and compute test error
 
        print(self.getTestError(predictions))
        self.printMetrics(predictions)
      #  print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))

        return model
Example 8
def testVectorIndexer(spark, data):

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures), ", ".join(
              str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
Example 9
def test_data_prepare(spark, filename, header):
    # Use this function when processing only a small amount of data, for better speed
    test_header, rdd = fw.read_data(spark, filename)
    # Initial cleaning of the header row
    test_header = list(test_header.split(','))
    test_header.pop(0)

    # # Use this function instead when running concurrently
    # test_header, rdd = fw.read_list_data(spark, filename)
    # Rescale columns whose values are on an overly large scale
    rdd = annual_premium_scaler(spark, rdd, test_header)
    # Merge strongly correlated features
    rdd, test_header = damaged_couple(rdd, test_header)
    # Discretize the age feature into categories
    rdd = arrange_for_age(rdd, test_header)
    # Based on the cleaned training-set header, record the indices of the columns to keep
    num_array = []
    for i in range(len(test_header)):
        if test_header[i] in header:
            num_array.append(i)

    # Filter the data according to that list
    def map_fuc(row):
        ret = []
        for n in num_array:
            ret.append(row[n])

        return ret

    rdd = rdd.map(map_fuc)

    # Convert into the format required by the ML algorithms
    def Vectors_map_fuc(row):
        features_array = np.array(row)
        index_array = np.arange(features_array.size)
        num = features_array.size

        return (Vectors.sparse(num, index_array, features_array), )

    labeled_points_rdd = rdd.map(Vectors_map_fuc)
    data = spark.createDataFrame(labeled_points_rdd,
                                 schema=['indexedFeatures'])
    data = VectorIndexer(inputCol="indexedFeatures",
                         outputCol="features",
                         maxCategories=4).fit(data).transform(data)
    # Drop the column that is no longer needed
    data = data.drop("indexedFeatures")
    return data.cache()
Example 10
def training(df):
    # 0. Load the cleaned data
    df_cleanning = df.select("id").distinct()
    # Split the data into training and test sets (30% held out for testing)
    (df_training, df_test) = df_cleanning.randomSplit([0.7, 0.3])

    # 1. load the training data
    # Prepare the training set
    df_result = df
    df_result = df_result.select("id", "label", "features")
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(df_result)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=6).fit(df_result)

    df_training.show(10)
    # 1.1 Build the training set
    df_training = df_training.join(df_result, how="left", on="id")
    df_training.show()
    print(df_training.count())

    # 1.2 Build the test set
    df_test = df_test.join(df_result, how="left", on="id")
    df_test.show()
    print(df_test.count())

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(df_training)

    # Make predictions.
    df_predictions = model.transform(df_test)

    # Select example rows to display.
    df_predictions.show(10)
    df_predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(df_predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/pfizer_model/0.0.4/model_without_prod"
    )
    print(treeModel.toDebugString)

    return treeModel
Example 11
def decision_tree_classifier(trainingDataFrame,
                             maxCategories=4,
                             maxDepth=5,
                             maxBins=32,
                             minInstancesPerNode=1,
                             minInfoGain=0.0,
                             maxMemoryInMB=256,
                             cacheNodeIds=False,
                             checkpointInterval=10,
                             impurity="gini",
                             seed=None):
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel"). \
                   setHandleInvalid("keep").fit(trainingDataFrame)
    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=maxDepth,
                                maxBins=maxBins,
                                minInstancesPerNode=minInstancesPerNode,
                                minInfoGain=minInfoGain,
                                maxMemoryInMB=maxMemoryInMB,
                                cacheNodeIds=cacheNodeIds,
                                checkpointInterval=checkpointInterval,
                                impurity=impurity,
                                seed=seed)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[2]
    return result
Example 12
 def trainModel(self, trainingData):
     """ Ham huan luyen du lieu
     Mac dinh training toan bo du lieu trong dataset splitratio 100% training, 0% testing
     """
     labelIndexer = StringIndexer(
         inputCol="label", outputCol="indexedLabel").fit(trainingData)
     featureIndexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(trainingData)
     rf = RandomForestClassifier(labelCol="indexedLabel",
                                 featuresCol="indexedFeatures",
                                 numTrees=30,
                                 maxDepth=5,
                                 maxBins=32,
                                 seed=None,
                                 impurity="gini")
     labelConverter = IndexToString(inputCol="prediction",
                                    outputCol="predictedLabel",
                                    labels=labelIndexer.labels)
     pipeline = Pipeline(
         stages=[labelIndexer, featureIndexer, rf, labelConverter])
     model = pipeline.fit(trainingData)
     model.write().overwrite().save(os.path.join(self.modelpath,
                                                 "detector"))
     return model
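
The pipeline above is persisted with model.write().overwrite().save(...). A minimal follow-up sketch (assumed names: a modelpath directory matching self.modelpath and a testData DataFrame that already has "label" and "features" columns) showing how the saved pipeline could be reloaded and applied:

import os
from pyspark.ml import PipelineModel

# Reload the persisted pipeline (StringIndexer + VectorIndexer + RandomForest + IndexToString)
loaded_model = PipelineModel.load(os.path.join(modelpath, "detector"))
# "predictedLabel" holds predictions converted back to the original label values
predictions = loaded_model.transform(testData)
predictions.select("label", "predictedLabel", "probability").show(5)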
Example 13
def main():

    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    text_file = sc.textFile("s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

    #3. Transform data
    af = (text_file.map(getVals))

    #4. Create a DataFrame out of this using the toDF method and cache it
    afdf = af.toDF([
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'duration'
    ]).cache()

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(afdf)

    #5. Create a train/test split with 70% of data in training set and 30% of data in test set
    afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(afdf_train)

    # Make predictions.
    predictions = model.transform(afdf_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    rfModel = model.stages[1]
    print(rfModel)  # summary only

    #Step 3: Building our Pipelines

    rfModel.save('s3a://spotifybuck/model-export' +
                 datetime.now().strftime('%Y%m%d%H%M'))
    pipeline.save('s3a://spotifybuck/pipeline-export' +
                  datetime.now().strftime('%Y%m%d%H%M'))

    sc.stop()
Example 14
def model(classifiers, training, testing, week):

    results = ""
    timing = []

    for classifier in classifiers:

        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        metrics = BinaryClassificationMetrics(
            prediction.select("label", "prediction").rdd)

        results = results + "new," + classifier + "," + week + "," + str(
            metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"

        timing.append(time.time() - timeStart)

    return results, timing
Example 15
  def prepare(self):
    data = (self.spark_session.read.format(self.data_format)
            .load(self.data_file))
    labelIndexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(data)
    featureIndexer = (VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=self.max_categories)
                      .fit(data))
    self.train_data, self.valid_data = data.randomSplit([0.8, 0.2])
    if self.model_builder.__name__ == 'DecisionTreeClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures")
    elif self.model_builder.__name__ == 'RandomForestClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures",
                                      numTrees=self.num_trees)
      labelConverter = IndexToString(inputCol="prediction",
                                     outputCol="predictedLabel",
                                     labels=labelIndexer.labels)
    elif self.model_builder.__name__ == 'GBTClassifier':
      classifier = self.model_builder(labelCol="indexedLabel",
                                      featuresCol="indexedFeatures",
                                      maxIter=self.max_iter)

    if self.model_builder.__name__ == 'RandomForestClassifier':
      self.pipeline = Pipeline(stages=[labelIndexer,
                                       featureIndexer,
                                       classifier,
                                       labelConverter])
    else:
      self.pipeline = Pipeline(stages=[labelIndexer,
                                       featureIndexer,
                                       classifier])
Example 16
 def _fit(self,
          dataset,
          estimator,
          estimatorParamMaps,
          samplingrates,
          numfolds=5):
     all_list = dataset.columns
     all_list.remove('Class')  # all feature column names
     assembler = VectorAssembler().setInputCols(all_list).setOutputCol(
         "features_vector")  #特征列转换为一列向量
     labelIndexer = StringIndexer(inputCol="Class",
                                  outputCol="label")  #统一标签列名称为label
     featureIndexer = VectorIndexer(
         inputCol="features_vector", outputCol="features",
         maxCategories=10)  # unify the feature-vector column name; features with fewer than 10 distinct values are indexed as categorical
     pipeline = Pipeline(stages=[labelIndexer, featureIndexer,
                                 estimator])  # build the ML pipeline from the three stages
     dataset = assembler.transform(dataset)  # add the feature vector column to the training set
     best_epm, best_sampling, metricsX = self.Cross_Validation(
         dataset, estimator, estimatorParamMaps, samplingrates,
         numfolds)  # cross-validation
     bestModel = pipeline.fit(
         dataset.sampleBy("Class", fractions={
             1.0: 1.0,
             0.0: best_sampling
         }), best_epm)  # fit the best model on the optimally sampled data and return it
     return bestModel, best_epm, best_sampling
Example 17
    def dtr(self):
        # Load and parse the data file, converting it to a DataFrame.
        data = self.session.read.format("libsvm").load(self.dataDir + "/data/mllib/sample_libsvm_data.txt")

        # Automatically identify categorical features, and index them.
        # Set maxCategories so features with > 4 distinct values are treated as continuous.
        featureIndexer = \
            VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Train a DecisionTree regression model.
        drg = DecisionTreeRegressor(featuresCol="indexedFeatures")

        # Chain indexer and tree in a Pipeline
        pipeline = Pipeline(stages=[featureIndexer, drg])

        # Train model.  This also runs the indexer.
        model = pipeline.fit(trainingData)

        # Make predictions.
        predictions = model.transform(testData)

        # Select example rows to display.
        predictions.select("prediction", "label", "features").show(5)

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(
            labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        treeModel = model.stages[1]
        print(treeModel)  # summary only
Example 18
def DecisionTree():
    IrisData = spark.sparkContext.textFile("file:///home/unbroken/MyFiles/Work/Programming/Spark/DecisionTree/Iris.txt")\
    .map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    IrisData.createOrReplaceTempView("iris")
    df = spark.sql("select * from iris")
    labelIndexer = StringIndexer(inputCol='label',
                                 outputCol='labelIndex').fit(IrisData)
    featureIndexer = VectorIndexer(
        inputCol='feature',
        outputCol='indexFeature').setMaxCategories(4).fit(IrisData)
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictionLabel').setLabels(
                                       labelIndexer.labels)
    trainingData, testingData = IrisData.randomSplit([0.7, 0.3])
    dtClassifier = DecisionTreeClassifier().setLabelCol(
        'labelIndex').setFeaturesCol('indexFeature')
    pipelineClassifier = Pipeline().setStages(
        [labelIndexer, featureIndexer, dtClassifier, labelConverter])
    modelClassifier = pipelineClassifier.fit(trainingData)
    prediction = modelClassifier.transform(testingData)
    print(prediction.show())

    evaluator = MulticlassClassificationEvaluator().setLabelCol(
        'labelIndex').setPredictionCol('prediction').setMetricName("accuracy")
    accuracy = evaluator.evaluate(prediction)
    print(accuracy)

    treeModelClassifier = modelClassifier.stages[2]
    print("Learned classification tree model:\n" +
          str(treeModelClassifier.toDebugString))
Example 19
def entrenar(df):
    vectorAssembler = VectorAssembler(inputCols=[
        "Position", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing",
        "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing",
        "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions",
        "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ],
                                      outputCol="features")
    stringIndexer = StringIndexer(inputCol="Position",
                                  outputCol="indexedLabel")
    vectorIndexer = VectorIndexer(inputCol="features",
                                  outputCol="indexedFeatures")

    # Split into training data and test data
    (training_df, test_df) = df.randomSplit([0.7, 0.3])

    # Configure the neural network
    capas = [13, 13, 13, 2]
    entrenador = MultilayerPerceptronClassifier(layers=capas,
                                                featuresCol="indexedFeatures",
                                                labelCol="indexedLabel",
                                                maxIter=10000)

    # Train the neural network
    pipeline = Pipeline(
        stages=[vectorAssembler, stringIndexer, vectorIndexer, entrenador])
    return pipeline.fit(training_df), test_df
Example 20
    def trainModel(self, trainingData):
        """ Ham huan luyen du lieu
        Mac dinh training toan bo du lieu trong dataset splitratio 100% training, 0% testing
        """
        # Convert all labels to numeric indices if they are not already numeric
        # trainingData.select("label").groupBy("label").count().show()
        labelIndexer = StringIndexer(
            inputCol="label", outputCol="indexedLabel").fit(trainingData)
        # Index all categorical feature values if they are not already numeric
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=4).fit(trainingData)
        # Declare the RandomForest algorithm
        rf = RandomForestClassifier(labelCol="indexedLabel",
                                    featuresCol="indexedFeatures",
                                    numTrees=30,
                                    maxDepth=5,
                                    maxBins=32,
                                    seed=None,
                                    impurity="gini")
        # Convert predicted labels from indices back to the original labels
        labelConverter = IndexToString(inputCol="prediction",
                                       outputCol="predictedLabel",
                                       labels=labelIndexer.labels)
        # Combine all steps into a single pipeline
        pipeline = Pipeline(
            stages=[labelIndexer, featureIndexer, rf, labelConverter])

        # Train the model through the pipeline
        model = pipeline.fit(trainingData)
        model.write().overwrite().save(os.path.join(self.modelpath,
                                                    "detector"))
        return model
Example 21
 def train_boosted_regression(self,
                              depth=2,
                              n_trees=50,
                              learning_rate=.01,
                              max_cats=6):
     '''
     Train the dataset on gradient-boosted decision trees.
     --------
     Parameters
     depth: int - maximum allowable depth of each decision tree
     n_trees: int - maximum number of boosting iterations
     learning_rate: float - step size used at each boosting iteration
     max_cats: int - maxCategories for VectorIndexer; features with more distinct values are treated as continuous
     --------
     '''
     featureIndexer = \
     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=max_cats).fit(self.train)
     gbr = GBTRegressor(labelCol='label',
                        featuresCol="features",
                        maxDepth=depth,
                        maxIter=n_trees,
                        stepSize=learning_rate,
                        maxMemoryInMB=2000)
     pipeline = Pipeline(stages=[featureIndexer, gbr])
     # Train model.  This also runs the indexer.
     self.model = pipeline.fit(self.train)
Example 22
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data= get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in  sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3<end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2>start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled ==0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
Example 23
    def vector_index(cls,
                     input_column,
                     max_categories,
                     output_column="features"):
        """
        author: [email protected]
        Appends a column to the df, containing all the values of required columns for every record.

        :param df: dataframe containing data to be processed
        :param feature_list: list of required columns to be considered for vector assembling
        :param features_column: name for the newly appended column
        :return: vector assembled df
        """
        try:
            cls.logger.debug("Columns to vector index: " + str(input_column))
            cls.logger.info("Vector indexing data")
            vector_indexer = VectorIndexer(inputCol=input_column,
                                           outputCol=output_column,
                                           maxCategories=max_categories,
                                           handleInvalid='keep')
            return vector_indexer

        except Exception as exp:
            cls.logger.error(
                'Exception occurred while applying vector indexer to input column: '
                + str(input_column))
            raise DataFrameException(exp)
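
A minimal usage sketch for the helper above (assumed, not from the source: a wrapper class named FeatureTransformers exposing vector_index, and a DataFrame assembled_df that already contains an assembled vector column "assembled_features"); since the helper returns an unfitted VectorIndexer, it still has to be fit before transforming:

# Hypothetical class and column names; only the vector_index call mirrors the code above.
indexer = FeatureTransformers.vector_index(input_column="assembled_features",
                                           max_categories=10,
                                           output_column="features")
indexed_df = indexer.fit(assembled_df).transform(assembled_df)
indexed_df.select("features").show(5, truncate=False)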
Example 24
def UsefulnessPredictionSentmentWithoutCV(trainingdata, model):
    # Data Preprocessing
    assembler = VectorAssembler(inputCols=[
        'num', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos',
        'sentiment_compound', 'Character_adj', 'Character_noun',
        'Character_verb', 'Character_adv'
    ],
                                outputCol="features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="indexedFeatures")

    pipeline = Pipeline(stages=[assembler, featureIndexer, model])

    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    Model = pipeline.fit(trainingdata)

    return Model
Example 25
    def test_random_forrest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example 26
def train(data, max_depth, max_bins):
    print("Parameters: max_depth: {}  max_bins: {}".format(
        max_depth, max_bins))
    #     spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    #     data = spark.read.format("libsvm").load(os.environ['DSX_PROJECT_DIR']+data_path)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)
    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=max_depth,
                                maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {} ".format(test_error))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    tree_model = model.stages[2]
    print(tree_model)

    mlflow.spark.log_model(model, '')

    spark.stop()
Example 27
 def DonusumuBaslat(self):
         sp_df = self.spark_df
         messagebox.showinfo("Uyarı","Dönüşüm Başladı")
         self.data_f = self.get_dummy()
         self.data_f.show(25,False)
         self.labelIndexer = StringIndexer(inputCol='label',outputCol='indexedLabel').fit(self.data_f)
         self.labelIndexer.transform(self.data_f).show(25,False)
         self.featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(self.data_f)
         self.featureIndexer.transform(self.data_f).show(25,False)
         self.labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",labels=self.labelIndexer.labels)
         if self.testTxt.get()=='':
             messagebox.showinfo("Hata","Lütfen Test oranını girin")
         else:
            deger = self.testTxt.get()
            testPoint=float(deger)/100
            (self.trainingData, self.testData) = self.data_f.randomSplit([1.0-testPoint, testPoint], seed = 100)
            messagebox.showinfo("Başarılı","Oran Hesaplandı")
            self.DonusumBtn.grid_remove()
Example 28
def decision_tree_regression(trainingDataFrame, maxCategories=4):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[1]
    return result
Example 29
def random_forest_regression(trainingDataFrame, maxCategories=4, numTrees=10):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    rf = RandomForestRegressor(featuresCol="indexedFeatures", numTrees=numTrees)
    pipeline = Pipeline(stages=[featureIndexer, rf])
    rfModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = rfModel
    result["summary"] = rfModel.stages[1]
    return result
Example 30
def gradient_boosted_tree_regression(trainingDataFrame, maxCategories=4, maxIter=10):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=maxIter)
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    gbtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = gbtModel
    result["summary"] = gbtModel.stages[1]
    return result
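
A minimal usage sketch for the three regression helpers above (assuming an existing SparkSession named spark and the sample_libsvm_data.txt file shipped with Spark); each helper returns a dict whose "model" entry is the fitted Pipeline:

from pyspark.ml.evaluation import RegressionEvaluator

# Hypothetical driver code: load data, split, fit one of the wrappers, and report RMSE.
df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
result = gradient_boosted_tree_regression(train_df, maxCategories=4, maxIter=10)
predictions = result["model"].transform(test_df)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
print("Test RMSE = %g" % evaluator.evaluate(predictions))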
Example 31
from pyspark.ml.feature import IndexToString
# idxRes (with its "labelInd" column) and simpleDF below come from earlier cells of this notebook
labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show()


# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors
idxIn = spark.createDataFrame([
  (Vectors.dense(1, 2, 3),1),
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()


# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()


# COMMAND ----------
Example 32
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorIndexerExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
    # $example off$

    spark.stop()