Example #1
# Shared imports for the five examples below (module paths for the
# project-local helpers DataUtil, OutputUtil, NBCFTransformer, FPGEstimator,
# FPGTransformer, and IBCFWithItemFactor are not shown in the source).
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, lit
from pyspark.sql.types import DoubleType, IntegerType


def analyze(spark, user, items, ratings, faculty):
    # load historical data for the faculty
    data_util = DataUtil(spark)

    db_df = data_util.load_all_df(faculty)
    item_df = data_util.get_item_df(db_df)
    # build a DataFrame from the new user's ratings and append it to the
    # historical data so the model is trained on both
    new_df = data_util.create_df_from_new_data(user, items, ratings, faculty)
    new_item_df = data_util.get_item_df(new_df)
    input_df = db_df.union(new_df)
    # preprocess: normalize course codes with the project's course mapper
    input_after_mapping_df = data_util.mapping_course(input_df)
    item_df = data_util.mapping_course(item_df)
    new_item_df = data_util.mapping_course(new_item_df)

    # index items: ALS needs integer ids, so map each course code to an index
    item_indexer = StringIndexer().setInputCol(data_util.get_item_col()).setOutputCol("F_MAMH_index")
    item_indexer_model = item_indexer.fit(input_after_mapping_df)
    input_index_df = item_indexer_model.transform(input_after_mapping_df) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType()))

    # missing items: catalog courses the user has not rated yet; these are
    # the rows the model will predict
    missing_item_df = item_indexer_model.transform(item_df.subtract(new_item_df)) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType())) \
        .withColumn("MASV1", lit(user).cast(IntegerType()))

    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    # non-negative ALS trained on the combined history + new ratings
    als_nn = ALS(rank=2, maxIter=15, regParam=0.01, userCol="MASV1",
                 itemCol="F_MAMH_index", ratingCol="TKET",
                 coldStartStrategy="drop", nonnegative=True)

    als_model = als_nn.fit(input_index_df)
    # predict a grade for each unrated course and emit the result
    output_df = als_model.transform(missing_item_df)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
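For context, a minimal driver sketch for this function follows. The SparkSession settings are assumptions, and the student ID, course codes, and grades are borrowed from the commented-out sample data in Example #2; faculty "MT" is one of the codes trained there.

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("als-recommend").getOrCreate()
    # hypothetical input: one student with a handful of graded courses
    analyze(spark,
            user=1512400,
            items=["CO3059", "CO3031", "CO3055"],
            ratings=[10.0, 9.5, 9.0],
            faculty="MT")
    spark.stop()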
Example #2
# Assumes the same imports as Example #1; FPGEstimator is a project-local
# frequent-pattern-growth estimator.
def analyze(spark):
    # Sample input for local smoke testing (kept commented out):
    # input_predict_data = [
    #     (1512400, "CO3059", 10.0),
    #     (1512400, "CO3031", 9.5),
    #     (1512400, "CO3055", 9.0),
    #     (1512400, "CO4027", 9.5),
    #     (1512400, "CO3029", 8.0),
    #     (1512400, "CO3021", 10.0),
    #     (1512400, "IM3001", 9.0),
    #     (1512400, "MT2001", 7.5),
    #     (1512400, "SP1007", 8.5),
    #     (1512400, "MT1005", 8.5),
    #     (1512400, "PH1003", 7.5),
    #     (1512400, "CO3043", 0.0),
    #     (1512400, "CO3025", 1.0),
    #     (1512400, "CO4313", 2.0)
    #     ]
    # schema = StructType([
    #     StructField("MASV1", IntegerType(), True),
    #     StructField("F_MAMH", StringType(), True),
    #     StructField("TKET", DoubleType(), True)])

    # inputDF = spark.createDataFrame(input_predict_data, schema)

    data_util = DataUtil(spark)
    # faculty codes for which a model is trained and saved
    list_faculty = [
        "MT", "BD", "CK", "DC", "DD", "GT", "HC", "MO", "PD", "QL", "UD", "VL",
        "VP", "XD"
    ]

    # the trailing 0.2 and 0.8 are project-defined thresholds (presumably
    # FP-Growth minSupport and minConfidence)
    estimator = FPGEstimator(spark, data_util.get_user_col(),
                             data_util.get_item_col(),
                             data_util.get_rating_col(), 0.2, 0.8)

    for faculty in list_faculty:
        # keep only grades of at least 5 (TKET is on a 10-point scale),
        # so the mined patterns reflect courses the student completed
        data_df = data_util.mapping_course(data_util.load_all_df(faculty))\
            .filter(col(data_util.get_rating_col()) >= 5)
        transformer = estimator.fit(
            data_df.select(data_util.get_user_col(), data_util.get_item_col()))
        transformer.save("model/{}/fp".format(faculty))
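Once trained, each saved model can be reloaded with the FPGTransformer used in Example #4. A minimal smoke test, assuming the commented-out inputDF above has been uncommented:

def smoke_test(spark, input_df, faculty="MT"):
    # reload the frequent-pattern model saved by analyze() above and
    # show the courses it recommends for the sample input
    transformer = FPGTransformer.load(spark, "model/{}/fp".format(faculty))
    transformer.transform(input_df).show()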
Example #3
# Assumes the same imports as Example #1; NBCFTransformer is a project-local
# model class (the ubcf_model name below suggests user-based CF).
def analyze(spark, user, items, ratings, faculty):
    data_util = DataUtil(spark)
    model_location = "model/{}/nbcf".format(faculty)
    item_df = data_util.load_all_df(faculty).select(
        data_util.get_item_col()).distinct()

    new_data = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))

    # courses in the catalog that the user has not rated yet, tagged with
    # the user's id so the model can score them
    missing_data = item_df.subtract(new_data.select(data_util.get_item_col()).distinct())\
        .withColumn(data_util.get_user_col(), lit(user).cast(IntegerType()))

    # load the pre-trained per-faculty model and predict the missing courses
    ubcf_model = NBCFTransformer.load(spark, model_location)

    output_df = ubcf_model.transform(missing_data)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
Example #4
# Assumes the same imports as Example #1; FPGTransformer is the project-local
# counterpart of the FPGEstimator trained in Example #2.
def analyze(spark, user, items, ratings, faculty):
    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    model_location = "model/{}/fp".format(faculty)

    # load data
    data_util = DataUtil(spark)

    new_df = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))
    # stage 1: the saved frequent-pattern model proposes candidate courses
    # based on the user's new ratings
    model_transformer = FPGTransformer.load(spark, model_location)
    recommend_df = model_transformer.transform(new_df)

    # stage 2: train ALS on history + new ratings to predict a grade for
    # each recommended course
    db_df = data_util.mapping_course(data_util.load_all_df(faculty))
    input_df = db_df.union(new_df).withColumn("TKET",
                                              col("TKET").cast(DoubleType()))

    # index items: ALS needs integer item ids
    item_indexer = StringIndexer().setInputCol(
        data_util.get_item_col()).setOutputCol("F_MAMH_index")
    item_indexer_model = item_indexer.fit(input_df)
    input_index_df = item_indexer_model.transform(input_df) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType()))

    recommend_df = item_indexer_model.transform(recommend_df)
    # train the ALS rating model
    als = ALS(rank=2,
              maxIter=15,
              regParam=0.01,
              userCol="MASV1",
              itemCol="F_MAMH_index",
              ratingCol="TKET",
              coldStartStrategy="drop",
              nonnegative=True)

    als_model = als.fit(input_index_df)
    output_df = als_model.transform(recommend_df)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
Example #5
# Assumes the same imports as Example #1; IBCFWithItemFactor is a
# project-local item-based CF model (the als_ibcf path suggests it is
# built on ALS item factors).
def analyze(spark, user, items, ratings, faculty):
    data_util = DataUtil(spark)
    model_location = "model/{}/als_ibcf".format(faculty)
    item_df = data_util.mapping_course(data_util.load_all_df(faculty)).select(
        data_util.get_item_col()).distinct()
    new_data = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))
    # courses the user has not rated yet, tagged with the user's id
    missing_data = item_df.subtract(new_data.select(data_util.get_item_col()).distinct())\
        .withColumn(data_util.get_user_col(), lit(user).cast(IntegerType()))
    # reload the pre-trained model and bind the project's column names
    als_ibcf_model = IBCFWithItemFactor(spark)\
        .load(model_location)\
        .setUserCol(data_util.get_user_col())\
        .setItemCol(data_util.get_item_col())\
        .setValueCol(data_util.get_rating_col())

    # score the unrated courses against the user's new ratings and emit
    output_df = als_ibcf_model.transform(new_data, missing_data)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
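Since Examples #1, #3, #4, and #5 share the same signature, a thin driver can dispatch between the recommenders. A sketch, assuming each variant lives in its own module (the module names here are hypothetical):

from pyspark.sql import SparkSession

# hypothetical module names; each exposes the analyze() shown above
import als_recommend, nbcf_recommend, fp_recommend, ibcf_recommend

ENGINES = {
    "als": als_recommend.analyze,
    "nbcf": nbcf_recommend.analyze,
    "fp": fp_recommend.analyze,
    "ibcf": ibcf_recommend.analyze,
}

def run(engine, user, items, ratings, faculty):
    spark = SparkSession.builder.appName("recommend").getOrCreate()
    try:
        ENGINES[engine](spark, user, items, ratings, faculty)
    finally:
        spark.stop()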