from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator


def encode_courses_and_instructors(df):
    # Map the string columns to numeric category indices
    course_string_indexer = StringIndexer(inputCol="course", outputCol="course_index")
    instructor_string_indexer = StringIndexer(inputCol="instructor", outputCol="instructor_index")

    # One-hot encoder (Spark 2.x API; renamed to OneHotEncoder in Spark 3.0), configured
    # to keep all categories. Note it is configured here but not added to the pipeline
    # below, which only indexes the string columns.
    encoder = OneHotEncoderEstimator(inputCols=["course_index", "instructor_index"],
                                     outputCols=["course_vec", "instructor_vec"])
    encoder.setDropLast(False)

    pipeline = Pipeline(stages=[course_string_indexer, instructor_string_indexer])

    indexed_data = pipeline.fit(df).transform(df)

    # Make a table mapping course index and course name
    courses_table = indexed_data.select('course_index', 'course').distinct()

    # Make a table mapping instructor index and instructor name
    instructors_table = indexed_data.select('instructor_index', 'instructor').distinct()

    indexed_data = indexed_data.select('course_index', 'instructor_index', 'term', 'year', "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "num_invited", "num_responded")

    return indexed_data, courses_table, instructors_table
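
# Minimal usage sketch (an assumption, not part of the original snippet): it requires an
# active SparkSession named `spark` and a hypothetical CSV with "course" and "instructor"
# string columns plus the term/year/cat*/num_* columns selected above.
evals_df = spark.read.csv("course_evals.csv", header=True, inferSchema=True)
indexed_data, courses_table, instructors_table = encode_courses_and_instructors(evals_df)
courses_table.show()      # course_index -> course name lookup
instructors_table.show()  # instructor_index -> instructor name lookup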
Example #2

# This snippet assumes `dataset_df` (an Adult/census-style DataFrame), `label` (the name of
# the target column) and `sf` (pyspark.sql.functions) are already defined, and that
# StringIndexer, OneHotEncoderEstimator, VectorAssembler and Pipeline are imported.
stages = []  # stages of the main transformation pipeline

# tag every row with a constant model_type flag
dataset_df = dataset_df.withColumn('model_type', sf.lit(1))

# list of categorical features for further one-hot encoding
cat_features = [
    "age_bin", "WorkClass", "fnlwgt_bin", "Education", "edu_num_bin",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "cap_gain_bin", "CapitalLoss", "hours_per_w_bin", "NativeCountry"
]

# one-hot encode each categorical feature: first index the strings, then encode the indices
for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature,
                                   outputCol=feature + "_index")
    encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()],
                                     outputCols=[feature + "_vec"])
    encoder.setDropLast(False)
    stages += [string_indexer, encoder]
# assemble all one-hot vectors into a single intermediate vector column
assembler_inputs = [feature + "_vec" for feature in cat_features]
assembler = VectorAssembler(inputCols=assembler_inputs,
                            outputCol="assembled_inputs")
stages += [assembler]
# wrap the assembled vector into the final "features" column
assembler_final = VectorAssembler(inputCols=["assembled_inputs"],
                                  outputCol="features")
# index the target column
label_indexer = StringIndexer(inputCol=label, outputCol=label + "_idx")
stages += [assembler_final]
stages += [label_indexer]
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(dataset_df)
dataset_transformed = pipeline_model.transform(dataset_df)
cat_dict = []     # per-feature category metadata (populated elsewhere)
decode_dict = {}  # maps a one-hot feature position to its (feature, category value, index)
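
# A hedged sketch (not in the original snippet) of how decode_dict could be filled from the
# transformed data, mirroring the loop used in the test further below: each entry maps a
# running one-hot position to (feature number, category value, category index).
counter = 0
for cat, feature in enumerate(cat_features):
    col_idx = (dataset_transformed
               .select(feature, feature + "_index")
               .distinct()
               .rdd.collectAsMap())
    for value, index in sorted(col_idx.items(), key=lambda kv: kv[1]):
        decode_dict[counter] = (cat, value, index, counter)
        counter += 1
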
Example #3
 def test_attr_spark(self):
     # Excerpt from a unittest.TestCase: assumes pyspark (SparkConf, SparkContext, SQLContext,
     # the ml feature transformers, LinearRegression) and the project's spark_utils,
     # spark_slicer and spark_union_slicer modules are imported at module level, and that
     # self.debug, self.alpha, self.k, self.w and self.loss_type are set on the test class.
     conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
     num_partitions = 2
     enumerator = "join"
     model_type = "regression"
     label = 'target'
     sparkContext = SparkContext(conf=conf)
     sqlContext = SQLContext(sparkContext)
     train_df = sqlContext.read.csv("toy_train.csv", header='true',
                         inferSchema='true')
     test_df = sqlContext.read.csv("toy.csv", header='true',
                         inferSchema='true')
     # initializing stages of main transformation pipeline
     stages = []
     # list of categorical features for further hot-encoding
     cat_features = ['a', 'b', 'c']
     for feature in cat_features:
         string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
         encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
         encoder.setDropLast(False)
         stages += [string_indexer, encoder]
     assembler_inputs = [feature + "_vec" for feature in cat_features]
     assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
     stages += [assembler]
     assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
     stages += [assembler_final]
     pipeline = Pipeline(stages=stages)
     # note: the pipeline is fitted separately on the train and test sets, so the
     # indexers' category mappings are derived independently for each DataFrame
     train_pipeline_model = pipeline.fit(train_df)
     test_pipeline_model = pipeline.fit(test_df)
     train_df_transformed = train_pipeline_model.transform(train_df)
     test_df_transformed = test_pipeline_model.transform(test_df)
     train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
     test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
     # build a lookup from each one-hot feature position to (feature no., category value, index)
     decode_dict = {}
     counter = 0
     cat = 0
     for feature in cat_features:
         colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
         colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
         for item in colIdx:
             decode_dict[counter] = (cat, item, colIdx[item], counter)
             counter = counter + 1
         cat = cat + 1
     train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
     test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
     lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
     lr_model = lr.fit(train_df_transform_fin)
     # evaluate the fitted model on the held-out test set (avoid shadowing the builtin eval)
     eval_result = lr_model.evaluate(test_df_transform_fin)
     f_l2 = eval_result.meanSquaredError
     pred = eval_result.predictions
     pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
     predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
     # note: this IndexToString converter is created but never used below
     converter = IndexToString(inputCol='features', outputCol='cats')
     all_features = list(decode_dict)
     predictions = predictions.collect()
     spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="join")
     spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="union")
     self.assertEqual(3, len(spark_join.slices))
     print("check1")
     self.assertEqual(spark_join.min_score, spark_union.min_score)
     print("check2")
     self.assertEqual(spark_join.keys, spark_union.keys)
     print("check3")
     self.assertEqual(len(spark_join.slices), len(spark_union.slices))
     print("check4")
     idx = -1
     for sliced in spark_join.slices:
         idx += 1
         self.assertEqual(sliced.score, spark_union.slices[idx].score)
     print("check5")