from pyspark.ml import Pipeline
from pyspark.ml.feature import (Imputer, MinMaxScaler, OneHotEncoderEstimator,
                                StringIndexer, VectorAssembler)


def get_ml1_pipeline():
    stages = []

    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS, outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column, outputCol=categorical_column + "_index", handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols, outputCols=ohe_output_cols, handleInvalid="error", dropLast=False)
    stages.append(encoder)

    numerical_vector_assembler = VectorAssembler(inputCols=ML1_NUMERICAL_COLUMNS, outputCol="numerical_cols_vec", handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerical_cols_vec", outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    label_str_indexer = StringIndexer(inputCol="result", outputCol="label", handleInvalid="keep")
    stages.append(label_str_indexer)

    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input, outputCol="features", handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages=stages)
    return pipeline
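
A minimal usage sketch (assumed, not from the source), given a hypothetical DataFrame df that contains the ML1_* columns and the string "result" column:

pipeline = get_ml1_pipeline()
model = pipeline.fit(df)          # fits imputer, indexers, encoder, and scaler
features_df = model.transform(df).select("features", "label")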
Example #2

from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
def one_hot(datafile):
    spark = init_spark()
    df = spark.read.format("csv").option("header", "true").load(datafile)
    df1 = df.select(
        "age",
        "gender",
        "signup_method",
        "signup_flow",
        "language",
        "affiliate_channel",
        "affiliate_provider",
        "first_affiliate_tracked",
        "signup_app",
        "first_device_type",
        "first_browser",
        "country_destination")
    # Cast age to numeric first so that fillna applies, then impute missing
    # ages with the mean age and drop rows that still contain nulls
    df1 = df1.withColumn("age", df1["age"].cast(IntegerType()))
    age_average = int(df1.agg({"age": "avg"}).collect()[0][0])
    df2 = df1.fillna({"age": age_average})
    data_df = df2.dropna()

    indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in categorical_features]

    encoder = OneHotEncoderEstimator(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers]
    )

    assembler = VectorAssembler(
        inputCols=encoder.getOutputCols(),
        outputCol="cat_features"
    )

    # Combine all the numerical features together
    assembler2=VectorAssembler(
        inputCols=numberical_feature,
        outputCol="num_features"
    )

    pipeline = Pipeline(stages=indexers + [encoder, assembler, assembler2])
    df_r = pipeline.fit(data_df).transform(data_df)
    return df_r
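
A hypothetical invocation (the CSV path is a placeholder), assuming categorical_features and numberical_feature are defined at module level:

df_encoded = one_hot("train_users.csv")
df_encoded.select("cat_features", "num_features").show(5)

Example #3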
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator

indexers = [
    StringIndexer(inputCol=c,
                  outputCol="{0}_indexed".format(c)).setHandleInvalid("keep")
    for c in categorical
]

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[
        "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers
    ])

assembler = VectorAssembler(inputCols=encoder.getOutputCols(),
                            outputCol="features")

stages = indexers + [encoder, assembler]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

one_hot_encoder = pipeline.fit(df_train)

df_train_encoded = one_hot_encoder.transform(df_train)

df_train_encoded.show()

df_train_encoded = df_train_encoded.select(["label", "features"])
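Example #4

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler

# (target, indexers, integerCols, trainingData, and testData are assumed to be
# defined in the untruncated original source)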
label = StringIndexer(inputCol=target[0], outputCol="label")

# One-hot encode the categorical features so the model can consume them as numeric vectors.
'''Note: one-hot encoding maps a categorical feature, represented as a label index, to a binary vector
with at most a single one-value indicating the presence of a specific feature value from the set of all feature values.'''
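# A hypothetical illustration (not from the original source): with three observed
# categories, label index 0.0 encodes as the sparse vector (3,[0],[1.0]) and index
# 2.0 as (3,[2],[1.0]); dropLast=False, as used below, keeps a slot for the last
# category instead of representing it as the all-zeros vector.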

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[
        "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers
    ],
    dropLast=False)

#combining all the feature columns into a single vector column
assembler = VectorAssembler(inputCols=encoder.getOutputCols() + integerCols,
                            outputCol="features")

#creating instance of a logistic regression model
lr = LogisticRegression(maxIter=10)

#laying down pipeline for model fitting
pipeline = Pipeline(stages=indexers + [encoder, assembler, label, lr])

# fit train split  with LogisticRegression model using the pipeline
lr_model = pipeline.fit(trainingData)

#making predictions on test data using the transform method
predictions = lr_model.transform(testData)

# Extracting the true label, prediction, and probability to compute log loss
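
A minimal sketch of that computation (assumed, not the original author's code), valid for a binary label and the default "probability" output column:

from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Extract the probability of the positive class from the prediction vector
p_one = udf(lambda v: float(v[1]), DoubleType())
log_loss = (predictions
            .withColumn("p", p_one(col("probability")))
            .selectExpr("avg(-(label * log(p + 1e-15) + (1.0 - label) * log(1.0 - p + 1e-15))) AS log_loss")
            .collect()[0][0])
print(log_loss)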
Example #5

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
def main(spark):
    getCsv()

    schema = StructType([
        StructField("age", IntegerType(), True),
        StructField("workclass", StringType(), True),
        StructField("fnlwgt", IntegerType(), True),
        StructField("education", StringType(), True),
        StructField("education-num", IntegerType(), True),
        StructField("marital-status", StringType(), True),
        StructField("occupation", StringType(), True),
        StructField("relationship", StringType(), True),
        StructField("race", StringType(), True),
        StructField("sex", StringType(), True),
        StructField("capital-gain", IntegerType(), True),
        StructField("capital-loss", IntegerType(), True),
        StructField("hours-per-week", IntegerType(), True),
        StructField("native-country", StringType(), True),
        StructField("salary", StringType(), True)
    ])

    train_df = spark.read.csv('train.csv', header=False, schema=schema)
    test_df = spark.read.csv('test.csv', header=False, schema=schema)

    print(train_df.limit(5).toPandas())

    categorical_variables = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country'
    ]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "-index")
        for column in categorical_variables
    ]

    encoder = OneHotEncoderEstimator(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=[
            "{0}-encoded".format(indexer.getOutputCol())
            for indexer in indexers
        ])

    assembler = VectorAssembler(inputCols=encoder.getOutputCols(),
                                outputCol="categorical-features")
    pipeline = Pipeline(stages=indexers + [encoder, assembler])
    # Fit the pipeline on the training data only and reuse the fitted model for
    # the test set, so both splits share the same category-to-index mappings
    pipeline_model = pipeline.fit(train_df)
    train_df = pipeline_model.transform(train_df)
    test_df = pipeline_model.transform(test_df)

    train_df.printSchema()

    df = train_df.limit(5).toPandas()
    print(df['categorical-features'][1])

    continuous_variables = [
        'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
        'hours-per-week'
    ]
    assembler = VectorAssembler(
        inputCols=['categorical-features', *continuous_variables],
        outputCol='features')
    train_df = assembler.transform(train_df)
    test_df = assembler.transform(test_df)
    print(train_df.limit(5).toPandas()['features'][0])

    indexer = StringIndexer(inputCol='salary',
                            outputCol='label',
                            handleInvalid="skip")
    # Again, fit on the training data and apply the fitted indexer to both splits
    indexer_model = indexer.fit(train_df)
    train_df = indexer_model.transform(train_df)
    test_df = indexer_model.transform(test_df)

    lr = LogisticRegression(featuresCol='features', labelCol='label')
    model = lr.fit(train_df)

    pred = model.transform(test_df)
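
    # A quick evaluation one might add here (not part of the original snippet),
    # assuming a binary salary label:
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    evaluator = BinaryClassificationEvaluator(labelCol='label')  # areaUnderROC by default
    print(evaluator.evaluate(pred))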
Example #6

indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("keep")
    for c in categorical
]

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[
        "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="features"
)

stages = indexers + [encoder, assembler]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

one_hot_encoder = pipeline.fit(df_train)

df_train_encoded = one_hot_encoder.transform(df_train)
Example #7
# NOTE: the head of this snippet is truncated in the source; the indexer
# variable names below are inferred from the output column names
ohe = OneHotEncoderEstimator(
    inputCols=[
        country_code_indexer.getOutputCol(),
        geoname_indexer.getOutputCol(),
        source_indexer.getOutputCol(),
        src_actor_indexer.getOutputCol(),
        src_agent_indexer.getOutputCol(),
        src_other_agent_indexer.getOutputCol(),
        target_indexer.getOutputCol(),
        tgt_actor_indexer.getOutputCol(),
        tgt_agent_indexer.getOutputCol(),
        tgt_other_agent_indexer.getOutputCol()
    ],
    outputCols=[
        "country_code_ohe", "geoname_ohe", "source_ohe", "src_actor_ohe",
        "src_agent_ohe", "src_other_agent_ohe", "target_ohe",
        "tgt_actor_ohe", "tgt_agent_ohe", "tgt_other_agent_ohe"
    ],
    handleInvalid='keep',
    dropLast=True)

# Combine all features into a single column
feature_assembler = VectorAssembler(
    inputCols=ohe.getOutputCols() + [goldstein_scaler.getOutputCol()],
    outputCol="features")

# Index root_code labels
label_indexer = StringIndexer(
    inputCol="root_code",
    outputCol="indexedLabel").setHandleInvalid('skip')

# Select a subset of important features
feature_selector = ChiSqSelector(
    percentile=0.5,
    featuresCol=feature_assembler.getOutputCol(),
    labelCol=label_indexer.getOutputCol(),
    outputCol="selected_features")
Example #8

# (This snippet is truncated upstream; df, catColumns, and cols are assumed
# to be defined in the untruncated original source.)
numColumns = [
    item[0] for item in df.dtypes if not item[1].startswith('string')
]
catColVectors = [c + '_vector' for c in catColumns]

# Change categorical values into numeric
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in catColumns
]
encoder = OneHotEncoderEstimator(
    inputCols=[c + "_index" for c in catColumns],
    outputCols=catColVectors)

assembler = VectorAssembler(inputCols=encoder.getOutputCols() + numColumns,
                            outputCol="features")

label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split the data into training and test sets; set a seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())
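
Example #9

from pyspark.sql.functions import percent_rank
from pyspark.sql.window import Window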
# BUILD FEATURES
# Prediction windows: 1, 5, 10, 20, 60
stockcreator = StockFeatureCreator(lags=10,
                                   pred_window=1,
                                   ma_windows=[3, 5, 10, 20, 50, 80, 100],
                                   tickers=["aapl", "sp500"])
stockcreator.build()

# ONE-HOT ENCODE CATEGORICAL DATE FEATURES
inputs = [s for s in time_cols if s not in ["Date"]]
encoder = OneHotEncoderEstimator(
    inputCols=inputs,
    outputCols=[s + "_Vec" for s in inputs])

# VECTOR ASSEMBLER
features = stockcreator.getOutputCols() + encoder.getOutputCols()  # getOutputCols() was returning an empty list
# Keep everything except the label and the raw time columns
features = [
    col for col in features if (col != "label") and (col not in time_cols)
]
featureassembler = VectorAssembler(inputCols=features, outputCol="features")

# SPLIT: time-ordered, first 90% for training, last 10% for testing
finalized_data = df.withColumn(
    "rank",
    percent_rank().over(Window.partitionBy().orderBy("Date")))
train_data = finalized_data.where("rank <= .9").drop("rank")
test_data = finalized_data.where("rank > .9").drop("rank")

# FEATURE SELECTION
selector = ChiSqSelector(numTopFeatures=300,
                         featuresCol="features",