# Shared imports for the entry points below. DataUtil, OutputUtil,
# FPGEstimator, FPGTransformer, NBCFTransformer and IBCFWithItemFactor are
# project-local helpers; their import paths are not shown in this source.
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, lit
from pyspark.sql.types import DoubleType, IntegerType


def analyze(spark, user, items, ratings, faculty):
    """Fit ALS on the faculty's historical grades plus the user's new
    ratings, then predict a grade for every course the user has not taken."""
    # Load data: historical records for the faculty, the course catalog,
    # and a DataFrame built from the user's newly submitted ratings.
    data_util = DataUtil(spark)
    db_df = data_util.load_all_df(faculty)
    item_df = data_util.get_item_df(db_df)
    new_df = data_util.create_df_from_new_data(user, items, ratings, faculty)
    new_item_df = data_util.get_item_df(new_df)
    input_df = db_df.union(new_df)

    # Preprocess: normalize course codes through the course mapper.
    input_after_mapping_df = data_util.mapping_course(input_df)
    item_df = data_util.mapping_course(item_df)
    new_item_df = data_util.mapping_course(new_item_df)

    # Index course codes into the integer ids that ALS requires.
    item_indexer = StringIndexer() \
        .setInputCol(data_util.get_item_col()) \
        .setOutputCol("F_MAMH_index")
    item_indexer_model = item_indexer.fit(input_after_mapping_df)
    input_index_df = item_indexer_model.transform(input_after_mapping_df) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType()))

    # The items to score are the catalog minus the user's rated courses,
    # tagged with the user's id.
    missing_item_df = item_indexer_model.transform(item_df.subtract(new_item_df)) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType())) \
        .withColumn("MASV1", lit(user).cast(IntegerType()))

    # Spark 2.x disables cross joins by default; enable them here.
    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    # Create the model and predict ratings for the missing items.
    als_nn = ALS(rank=2, maxIter=15, regParam=0.01,
                 userCol="MASV1", itemCol="F_MAMH_index", ratingCol="TKET",
                 coldStartStrategy="drop", nonnegative=True)
    als_model = als_nn.fit(input_index_df)
    output_df = als_model.transform(missing_item_df)

    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
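
# The commented-out transform of input_index_df in the original hints at a fit
# check. Below is a minimal sketch of measuring hold-out RMSE; the helper name
# evaluate_als_rmse, the 80/20 split, and the seed are illustrative
# assumptions, not part of this codebase.
from pyspark.ml.evaluation import RegressionEvaluator


def evaluate_als_rmse(input_index_df):
    """Hold out 20% of the ratings and report RMSE of the ALS predictions."""
    train_df, test_df = input_index_df.randomSplit([0.8, 0.2], seed=42)
    als = ALS(rank=2, maxIter=15, regParam=0.01,
              userCol="MASV1", itemCol="F_MAMH_index", ratingCol="TKET",
              coldStartStrategy="drop", nonnegative=True)
    predictions = als.fit(train_df).transform(test_df)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="TKET",
                                    predictionCol="prediction")
    return evaluator.evaluate(predictions)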
def analyze(spark):
    """Train and save an FP-Growth model for every faculty."""
    # Sample input for manually testing a trained transformer:
    # input_predict_data = [
    #     (1512400, "CO3059", 10.0), (1512400, "CO3031", 9.5),
    #     (1512400, "CO3055", 9.0), (1512400, "CO4027", 9.5),
    #     (1512400, "CO3029", 8.0), (1512400, "CO3021", 10.0),
    #     (1512400, "IM3001", 9.0), (1512400, "MT2001", 7.5),
    #     (1512400, "SP1007", 8.5), (1512400, "MT1005", 8.5),
    #     (1512400, "PH1003", 7.5), (1512400, "CO3043", 0.0),
    #     (1512400, "CO3025", 1.0), (1512400, "CO4313", 2.0)]
    # schema = StructType([StructField("MASV1", IntegerType(), True),
    #                      StructField("F_MAMH", StringType(), True),
    #                      StructField("TKET", DoubleType(), True)])
    # inputDF = spark.createDataFrame(input_predict_data, schema)

    data_util = DataUtil(spark)
    list_faculty = ["MT", "BD", "CK", "DC", "DD", "GT", "HC",
                    "MO", "PD", "QL", "UD", "VL", "VP", "XD"]

    estimator = FPGEstimator(spark,
                             data_util.get_user_col(),
                             data_util.get_item_col(),
                             data_util.get_rating_col(),
                             0.2, 0.8)

    for faculty in list_faculty:
        # Train only on records with a rating of at least 5.
        data_df = data_util.mapping_course(data_util.load_all_df(faculty)) \
            .filter(col(data_util.get_rating_col()) >= 5)
        transformer = estimator.fit(
            data_df.select(data_util.get_user_col(), data_util.get_item_col()))
        # transformer.transform(inputDF).show()
        transformer.save("model/{}/fp".format(faculty))
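
# FPGEstimator's internals are not shown in this source. As a rough sketch,
# equivalent per-faculty training could be expressed with Spark's built-in
# pyspark.ml.fpm.FPGrowth, assuming the 0.2 and 0.8 arguments above map to
# minSupport and minConfidence (an assumption, not confirmed by the source).
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import collect_set


def fit_fpgrowth_sketch(data_df, user_col, item_col):
    """Fit FP-Growth on one basket of course codes per student."""
    baskets_df = data_df.groupBy(user_col) \
        .agg(collect_set(item_col).alias("items"))
    fpg = FPGrowth(itemsCol="items", minSupport=0.2, minConfidence=0.8)
    # The returned FPGrowthModel exposes freqItemsets and associationRules.
    return fpg.fit(baskets_df)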
def analyze(spark, user, items, ratings, faculty):
    """Predict ratings for the user's untaken courses with a pre-trained
    neighborhood-based CF model."""
    data_util = DataUtil(spark)
    model_location = "model/{}/nbcf".format(faculty)

    # Distinct course catalog for the faculty, passed through the course
    # mapper so the subtract below aligns with the mapped new data (the
    # other entry points map the catalog the same way).
    item_df = data_util.mapping_course(data_util.load_all_df(faculty)) \
        .select(data_util.get_item_col()).distinct()
    new_data = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))

    # Courses the user has not rated yet, tagged with the user's id.
    missing_data = item_df.subtract(new_data.select(data_util.get_item_col()).distinct()) \
        .withColumn(data_util.get_user_col(), lit(user).cast(IntegerType()))

    ubcf_model = NBCFTransformer.load(spark, model_location)
    output_df = ubcf_model.transform(missing_data)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
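
# A hypothetical invocation of one of these entry points, reusing values from
# the commented-out test data in the FP-Growth training script above; the
# SparkSession setup and the faculty choice are assumptions.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("course-recommender").getOrCreate()
    analyze(spark,
            user=1512400,
            items=["CO3059", "CO3031", "CO3055"],
            ratings=[10.0, 9.5, 9.0],
            faculty="MT")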
def analyze(spark, user, items, ratings, faculty):
    """Recommend courses with the saved FP-Growth model, then predict a
    rating for each recommendation with an ALS model fitted on the fly."""
    # Spark 2.x disables cross joins by default; enable them here.
    spark.conf.set("spark.sql.crossJoin.enabled", "true")
    model_location = "model/{}/fp".format(faculty)

    # Load data: a DataFrame built from the user's newly submitted ratings.
    data_util = DataUtil(spark)
    new_df = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))

    # Stage 1: FP-Growth proposes candidate courses from the user's history.
    model_transformer = FPGTransformer.load(spark, model_location)
    recommend_df = model_transformer.transform(new_df)

    # Stage 2: predict a rating for each recommended course, fitting ALS on
    # the historical records plus the new ratings.
    db_df = data_util.mapping_course(data_util.load_all_df(faculty))
    input_df = db_df.union(new_df) \
        .withColumn("TKET", col("TKET").cast(DoubleType()))

    # Index course codes into the integer ids that ALS requires.
    item_indexer = StringIndexer() \
        .setInputCol(data_util.get_item_col()) \
        .setOutputCol("F_MAMH_index")
    item_indexer_model = item_indexer.fit(input_df)
    input_index_df = item_indexer_model.transform(input_df) \
        .withColumn("F_MAMH_index", col("F_MAMH_index").cast(IntegerType()))
    recommend_df = item_indexer_model.transform(recommend_df)

    # Create the model and score the recommended courses.
    als = ALS(rank=2, maxIter=15, regParam=0.01,
              userCol="MASV1", itemCol="F_MAMH_index", ratingCol="TKET",
              coldStartStrategy="drop", nonnegative=True)
    als_model = als.fit(input_index_df)
    output_df = als_model.transform(recommend_df)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
def analyze(spark, user, items, ratings, faculty):
    """Predict ratings for the user's untaken courses with a pre-trained
    item-based CF model built on ALS item factors."""
    data_util = DataUtil(spark)
    model_location = "model/{}/als_ibcf".format(faculty)

    # Distinct (mapped) course catalog for the faculty.
    item_df = data_util.mapping_course(data_util.load_all_df(faculty)) \
        .select(data_util.get_item_col()).distinct()
    new_data = data_util.mapping_course(
        data_util.create_df_from_new_data(user, items, ratings, faculty))

    # Courses the user has not rated yet, tagged with the user's id.
    missing_data = item_df.subtract(new_data.select(data_util.get_item_col()).distinct()) \
        .withColumn(data_util.get_user_col(), lit(user).cast(IntegerType()))

    als_ibcf_model = IBCFWithItemFactor(spark) \
        .load(model_location) \
        .setUserCol(data_util.get_user_col()) \
        .setItemCol(data_util.get_item_col()) \
        .setValueCol(data_util.get_rating_col())
    output_df = als_ibcf_model.transform(new_data, missing_data)
    OutputUtil(spark, "MASV1", "F_MAMH", "prediction").output(output_df, user)
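
# IBCFWithItemFactor's internals are likewise not shown. A self-contained
# sketch of the idea it presumably implements (an assumption): take the item
# factors learned by ALS, score item-item cosine similarity, and predict each
# unseen item's rating as the similarity-weighted mean of the user's ratings.
import numpy as np


def ibcf_from_item_factors(als_model, rated, candidates):
    """rated: {item_id: rating}; candidates: iterable of unseen item ids.
    Returns {item_id: predicted_rating}."""
    # ALSModel.itemFactors is a DataFrame of (id, features) rows.
    factors = {row["id"]: np.array(row["features"])
               for row in als_model.itemFactors.collect()}

    def cosine(a, b):
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(a.dot(b) / denom) if denom else 0.0

    predictions = {}
    for cand in candidates:
        if cand not in factors:
            continue  # cold item: ALS learned no factor vector for it
        sims = [(cosine(factors[cand], factors[i]), r)
                for i, r in rated.items() if i in factors]
        norm = sum(abs(s) for s, _ in sims)
        if norm:
            predictions[cand] = sum(s * r for s, r in sims) / norm
    return predictions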