Example #1
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(
        lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules
    modelResult = model.freqItemsets
    result = modelResult.join(result, modelResult['items'] == result["consequent"])
    total = df.count()

    result = result.withColumn("interest", abs(result["confidence"] - result["freq"] / total))
    result = result.select(size("antecedent").alias('tam'), 'antecedent', 'consequent', 'confidence', "items", "freq", "interest")
    result = result.sort(desc('tam'), desc('interest')).limit(n)
    result = result.select('antecedent', 'consequent', 'confidence', "items", "freq", "interest")

    return toCSVLine(result)
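This and several later assignment snippets call init_spark() and toCSVLine() without defining them; they come from the course's helper module, which is not shown here. A minimal sketch of what such helpers might look like (the exact CSV rendering of list columns is an assumption):

from pyspark.sql import SparkSession

def init_spark():
    # Build (or reuse) a local SparkSession for these scripts.
    return SparkSession.builder.appName("fp-growth-examples").getOrCreate()

def toCSVLine(df):
    # Collect a DataFrame and serialize every row into one CSV line,
    # joining list-valued columns (itemsets, antecedents, ...) into a single field.
    def field(value):
        if isinstance(value, list):
            return "[" + ",".join(str(v) for v in value) + "]"
        return str(value)
    return "\n".join(",".join(field(v) for v in row) for row in df.collect())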
Example #2
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see 
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html), 
    write a function that returns the first <n> frequent itemsets 
    obtained using min support <s> and min confidence <c> (parameters 
    of the FP-Growth model), sorted by (1) descending itemset size, and 
    (2) descending frequency. The FP-Growth model should be applied to 
    the DataFrame computed in the previous task. 
    
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.freqItemsets.orderBy([size("items"), "freq"],
                                         ascending=[0, 0])
    final_op = toCSVLine(model_1.limit(n))
    return final_op
Example #3
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see 
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html), 
    write a function that returns the first <n> frequent itemsets 
    obtained using min support <s> and min confidence <c> (parameters 
    of the FP-Growth model), sorted by (1) descending itemset size, and 
    (2) descending frequency. The FP-Growth model should be applied to 
    the DataFrame computed in the previous task. 
    
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant','items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.freqItemsets

    result = result.select("items", "freq", size("items").alias("tam"))
    result = result.sort(desc('tam'), desc('freq')).limit(n)
    result = result.select('items', 'freq')

    return toCSVLine(result)
Example #4
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    total_count = df.count()
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_updated = model.associationRules.join(
        model.freqItemsets,
        model.associationRules['consequent'] == model.freqItemsets['items'])
    model_with_interest = model_updated.withColumn(
        "interest",
        lit(
            calculate_interest(model_updated.confidence, model_updated.freq,
                               total_count)))
    model_1 = model_with_interest.drop("lift")
    model_2 = model_1.orderBy([size("antecedent"), "interest"],
                              ascending=[0, 0])
    final_op = toCSVLine(model_2.limit(n))
    return final_op
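calculate_interest is another project helper not shown above; a plausible sketch matching the docstring's formula, interest = |confidence - frequency(consequent)|, applied column-wise (hence the SQL abs rather than Python's builtin):

from pyspark.sql.functions import abs as sql_abs

def calculate_interest(confidence, freq, total_count):
    # interest = |confidence - frequency(consequent)|,
    # where frequency(consequent) = freq / total number of transactions.
    # confidence and freq are DataFrame columns here.
    return sql_abs(confidence - freq / total_count)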
Example #5
def process():

    data_content = [x.strip().split(',') for x in open(FILE_PATH).readlines()]
    data_content_tuple = []
    for i in range(0, len(data_content)):
        data_content_tuple.append((i, data_content[i]))

    df = spark.createDataFrame(data_content_tuple, ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.5)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    # model.freqItemsets

    model.freqItemsets.filter(size('items') > 0).orderBy('freq',
                                                         ascending=0).show(
                                                             50, False)

    print(type(model.freqItemsets))

    # Display generated association rules.
    model.associationRules.orderBy('confidence', ascending=0).show(200, False)

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show(50, False)
Example #6
def fp_growth(df):
    training, test = data_process(df)
    fpGrowth = FPGrowth(itemsCol='itemset', minSupport=0.1, minConfidence=0.2)
    model = fpGrowth.fit(training)

    # Display frequent itemsets.
    # model.freqItemsets.show()

    # Display generated association rules.
    rules = model.associationRules
    rules = get_valid_rules(rules)
    rules.show()

    # Display the predicted purchasing.
    # res = model.transform(test).orderBy('prediction', ascending=False)

    # Calculate conversion rate.
    res = test.join(rules, test.itemset == rules.antecedent).select(
        test["*"], rules["prediction"])
    conversion = F.udf(lambda x, y: 0 if len(set(x) & set(y)) == 0 else 1,
                       IntegerType())
    res = res.withColumn('conversion', conversion('ground_truth',
                                                  'prediction'))
    res.show()

    print("The total size of testset is: %d     K = %d" % (test.count(), K))
    total_c = res.count()
    print("The number of total recommendation is: %d" % total_c)
    total_v = res.agg(F.sum('conversion')).collect()[0][0]
    print("The number of correct recommendation is: %d" % total_v)

    print(df.count())
Example #7
    def rules(self):
        dataset = self._dataset()
        transactions_count = dataset.count()
        fp = FPGrowth(minSupport=self._min_support_count * 1.0 /
                      transactions_count,
                      minConfidence=self._min_confidence,
                      itemsCol="items",
                      numPartitions=self._partitions)
        fpm = fp.fit(dataset)

        association_rules = (
            fpm.associationRules
            # keep only rules whose antecedent and consequent both have size 1
            .filter((size("antecedent") == 1)
                    & (size("consequent") == 1)).withColumn(
                        'antecedent',
                        col("antecedent")[0]).withColumn(
                            'consequent',
                            col('consequent')[0]))
        window = Window.partitionBy(association_rules.antecedent).orderBy(
            association_rules.lift.desc())
        association_rules = (association_rules.select(
            '*',
            rank().over(window).alias('rank')).filter(
                col('rank') <= self._top_n).select("antecedent", "consequent",
                                                   "lift"))
        return association_rules
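The per-antecedent top-N filtering above relies on rank() over a window partitioned by antecedent. A small standalone illustration of that pattern on invented toy data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
rules = spark.createDataFrame(
    [("milk", "bread", 1.8), ("milk", "eggs", 1.2), ("milk", "butter", 2.1)],
    ["antecedent", "consequent", "lift"])

# Rank consequents within each antecedent by descending lift and keep the top 2.
w = Window.partitionBy("antecedent").orderBy(col("lift").desc())
rules.select("*", rank().over(w).alias("rank")).filter(col("rank") <= 2).show()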
Example #8
def main():

    spark = SparkSession \
        .builder \
        .getOrCreate()

    spark.sparkContext.setCheckpointDir('gs://reddit_data_soen498/checkpoint/')
    
    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = ["Art", "AskReddit", "DIY", "Documentaries", "EarthPorn", "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful", "Jokes", "LifeProTips", "Music", "OldSchoolCool", "Showerthoughts", "UpliftingNews", "announcements", "askscience", "aww", "blog", "books", "creepy", "dataisbeautiful", "explainlikeimfive", "food", "funny", "gadgets", "gaming", "gifs", "history", "listentothis", "mildlyinteresting", "movies", "news", "nosleep", "nottheonion", "personalfinance", "philosophy", "photoshopbattles", "pics", "science", "space", "sports", "television", "tifu", "todayilearned", "videos", "worldnews"]
        return x not in defaultSubs
    
    data = spark.read.json("gs://reddit_data_soen498/RC_2018-02.json")
    keep = [data.author, data.id, data.subreddit]
    data = data.select(*keep)
    data = data.filter(data.author != "[deleted]")
    data = data.filter(isNotDefault(data.subreddit))

    data = data.groupBy(data.author).agg(F.collect_set("subreddit").alias("items"))
    size_ = udf(lambda xs: len(xs), IntegerType())
    data = data.filter(size_(data.items) > 1)
    data = data.select(data.items)
    support = 200/data.count()
    fp = FPGrowth(minSupport=support, minConfidence=0.5)
    fpm = fp.fit(data)
    fpm.associationRules.show(100)
    
    fpm.save("gs://reddit_data_soen498/modelFP_noDefaultSub_20support")
Example #9
    def _run_FPGrowth(self, df):
        # Apply Spark ML's FP-growth algorithm for frequent itemset mining
        fpGrowth = FPGrowth(itemsCol="chordItems",
                            minSupport=self.params["minSupport"],
                            minConfidence=self.params["minConfidence"])
        model = fpGrowth.fit(df)
        return model
Example #10
def build_association_rule_model(item_set, min_support, min_confidence):
    # Use a low support as we have a large dataset
    fp_growth = FPGrowth(itemsCol="items", minSupport=min_support, minConfidence=min_confidence)
    
    print('Fitting FPGrowth....')
    model = fp_growth.fit(item_set)
    print('Fit Complete')
    return model
Example #11
def SurvivalIndexTimeout(timeoutpidsmap):
    global spcon
    sqlcon = SQLContext(spcon)
    timeoutdf = sqlcon.createDataFrame(timeoutpidsmap, ['index', 'process_ids'])
    fpGrowth = FPGrowth(itemsCol="process_ids", minSupport=0.5, minConfidence=0.5)
    fpModel = fpGrowth.fit(timeoutdf)
    fpModel.freqItemsets.show()
    fpModel.associationRules.show()
Example #12
    async def get_model(self, df, min_support=0.1, min_confidence=0.6):
        fpGrowth = FPGrowth(itemsCol="items",
                            minSupport=min_support,
                            minConfidence=min_confidence)
        model = fpGrowth.fit(df)
        return model.freqItemsets.sort(
            "freq", ascending=False), model.associationRules.sort(
                "confidence", ascending=True), model
Example #13
    def arRules(self, transaction):
        spark = SparkSession.builder.getOrCreate()

        R = Row('ID', 'items')  # use enumerate to add the ID column
        df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])
        fpGrowth = FPGrowth(itemsCol='items', minSupport=0.0001, minConfidence=0.0001)
        model = fpGrowth.fit(df)
        rules = model.associationRules.collect()  # Display generated association rules.
        return rules
Example #14
def recommendation(data, conf, outputpath):

    sparkSession = SparkSession.builder.getOrCreate()

    # extract algorithm parameters from conf file
    MyMinConfidence = conf["minConfidence"]
    MyMinSupport = conf["minSupport"]
    MyNumPartitions = conf["numPartitions"]
    MyMinFavorScore = conf["minfavorscore"]
    MyResultSavePath = os.path.join("hdfs://{0}".format(outputpath),
                                    "FPresult.json")
    MyModelSavePath = os.path.join("hdfs://{0}".format(outputpath), "FPmodel")

    print("============train FPmodel==============")

    df = sparkSession.createDataFrame(transformData(data, MyMinFavorScore),
                                      ["userId", "productIds"])
    fpGrowth = FPGrowth(itemsCol="productIds",
                        minSupport=MyMinSupport,
                        minConfidence=MyMinConfidence)
    model = fpGrowth.fit(df)

    print("============save association rules==============")
    # if the length of result is 0
    if model.associationRules.count() == 0:
        print(
            "============no association rules! retry to change algorithm parameters =============="
        )
    else:
        # determine if the file exists
        (ret, out,
         err) = MyUtil.run_cmd(['hdfs', 'dfs', '-test', '-e', MyModelSavePath])
        # if file already exists,then delete it
        if ret == 0:
            print(MyModelSavePath + " file already exists")
            MyUtil.run_cmd(['hdfs', 'dfs', '-rm', '-r', MyModelSavePath])
        else:
            print(MyModelSavePath + " file doesn't exist")
        model.save(MyModelSavePath)
    print("============save association results==============")
    # if the length of result is 0
    if model.associationRules.count() == 0:
        print(
            "============no association rules! retry to change algorithm parameters =============="
        )
    else:
        # determine if the file exists
        (ret, out, err) = MyUtil.run_cmd(
            ['hdfs', 'dfs', '-test', '-e', MyResultSavePath])
        # if file already exists,then delete it
        if ret == 0:
            print(MyResultSavePath + " file already exists")
            MyUtil.run_cmd(['hdfs', 'dfs', '-rm', '-r', MyResultSavePath])
        else:
            print(MyResultSavePath + " file doesn't exist")
        model.transform(df).write.json(MyResultSavePath)
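MyUtil.run_cmd is an external helper of that project; it presumably shells out to the hdfs CLI and returns the exit code together with the captured output. A minimal sketch under that assumption:

import subprocess

def run_cmd(args):
    # Run a shell command (e.g. ['hdfs', 'dfs', '-test', '-e', path])
    # and return (return code, stdout, stderr).
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    return proc.returncode, out, err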
Example #15
    def test_freq_itemsets(self):
        fp = FPGrowth()
        fpm = fp.fit(self.data)

        expected_freq_itemsets = self.spark.createDataFrame(
            [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)], ["items", "freq"]
        )
        actual_freq_itemsets = fpm.freqItemsets

        self.assertEqual(actual_freq_itemsets.subtract(expected_freq_itemsets).count(), 0)
        self.assertEqual(expected_freq_itemsets.subtract(actual_freq_itemsets).count(), 0)
Example #16
    def test_association_rules(self):
        fp = FPGrowth()
        fpm = fp.fit(self.data)

        expected_association_rules = self.spark.createDataFrame(
            [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)],
            ["antecedent", "consequent", "confidence", "lift"]
        )
        actual_association_rules = fpm.associationRules

        self.assertEqual(actual_association_rules.subtract(expected_association_rules).count(), 0)
        self.assertEqual(expected_association_rules.subtract(actual_association_rules).count(), 0)
Example #17
    def test_freq_itemsets(self):
        fp = FPGrowth()
        fpm = fp.fit(self.data)

        expected_freq_itemsets = self.spark.createDataFrame(
            [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)],
            ["items", "freq"]
        )
        actual_freq_itemsets = fpm.freqItemsets

        self.assertEqual(actual_freq_itemsets.subtract(expected_freq_itemsets).count(), 0)
        self.assertEqual(expected_freq_itemsets.subtract(actual_freq_itemsets).count(), 0)
Example #18
def interests(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that computes 
    the interest of association rules (interest = |confidence - 
    frequency(consequent)|; note the absolute value)  obtained using 
    min support <s> and min confidence <c> (parameters of the FP-Growth 
    model), and prints the first <n> rules sorted by (1) descending 
    antecedent size in association rule, and (2) descending interest.

    Return value: a CSV string.
    Test: tests/test_interests.py
    '''
    spark = init_spark()

    frame = construct()

    frame2 = frame.withColumn("items", explode(frame.items))

    frame2 = frame2.groupBy("items").count().sort(desc("count"))

    frame2 = frame2.withColumnRenamed("items", "consequent2")
    frame2 = frame2.withColumnRenamed("count", "freq")

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)

    model = fpGrowth.fit(frame)

    model = model.associationRules

    model = model.withColumn("consequent2", explode(model.consequent))

    model = model.join(frame2, "consequent2", "inner")

    model = model.withColumn(
        "interest", lit(abs(model.confidence - (model.freq / frame.count()))))

    model = model.select("*", size("antecedent"))

    model = model.withColumnRenamed("size(antecedent)", "ln")

    model = model.sort(desc("ln"), desc("interest"))

    model = model.select("antecedent", "consequent", "confidence",
                         "consequent", "freq", "interest")

    model = model.limit(n)

    string = toCSVLine(model)

    print(string)

    return string
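construct() (also used in Examples #31 and #32) is not shown in these snippets; judging from Examples #1 and #3 it builds the same (id, plant, items) DataFrame from the plants CSV. A sketch under that assumption, with a hypothetical default filename:

from pyspark.sql import SparkSession

def construct(filename="./data/plants.data"):
    # Each CSV line is "<plant>,<state>,<state>,...": turn it into an
    # (id, plant, items) row, where items is the list of states.
    spark = SparkSession.builder.getOrCreate()
    rdd = (spark.sparkContext.textFile(filename)
           .map(lambda l: l.split(","))
           .zipWithIndex()
           .map(lambda x: (x[1], x[0][0], x[0][1:])))
    return spark.createDataFrame(rdd, ["id", "plant", "items"])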
Example #19
    def arRules(self, transaction):
        spark = SparkSession.builder.config("spark.executor.memory",
                                            MAX_MEMORY).config(
                                                "spark.driver.memory",
                                                MAX_MEMORY).getOrCreate()

        R = Row('ID', 'items')  # use enumerate to add the ID column
        df = spark.createDataFrame(
            [R(i, x) for i, x in enumerate(transaction)])
        fpGrowth = FPGrowth(itemsCol='items',
                            minSupport=0.001,
                            minConfidence=0.001)
        model = fpGrowth.fit(df)
        return model
Example #20
def test_freq_itemsets():
    data = spark.createDataFrame([([1, 2], ), ([1, 2], ), ([1, 2, 3], ),
                                  ([1, 3], )], ["items"])

    fp = FPGrowth()
    fpm = fp.fit(data)

    expected_freq_itemsets = spark.createDataFrame([([1], 4), ([2], 3),
                                                    ([2, 1], 3), ([3], 2),
                                                    ([3, 1], 2)],
                                                   ["items", "freq"])
    actual_freq_itemsets = fpm.freqItemsets

    assert actual_freq_itemsets.subtract(expected_freq_itemsets).count() == 0
    assert expected_freq_itemsets.subtract(actual_freq_itemsets).count() == 0
Example #21
def test_association_rules():
    data = spark.createDataFrame([([1, 2], ), ([1, 2], ), ([1, 2, 3], ),
                                  ([1, 3], )], ["items"])

    fp = FPGrowth()
    fpm = fp.fit(data)

    expected_association_rules = spark.createDataFrame(
        [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)],
        ["antecedent", "consequent", "confidence", "lift"])

    actual_association_rules = fpm.associationRules

    assert actual_association_rules.subtract(
        expected_association_rules).count() == 0
    assert expected_association_rules.subtract(
        actual_association_rules).count() == 0
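Examples #20 and #21 reference a module-level spark session created elsewhere in their test module; a minimal way to provide one (master and app name are assumptions):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("fpm-tests").getOrCreate()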
Example #22
def cluster(request):
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'clean')
    data_df.cache()
    json_df = data_df.toPandas()
    json_df.to_json()

    # Create a tuple of id and items from the Data Frame
    dd = []
    for p in data_df:
        dd.append(p)

    data = []
    for row in json_df.itertuples():
        id = row[1]
        items = []

        for column in range(2, (len(dd) + 1)):
            items.append(row[column])
        data.append((id, items))

    # Create a Data Frame from the data dictionary
    final_data = Spark.sqlContext.createDataFrame(data, ["id", "items"])

    # Create the FPGrowth instance with its arguments and train the model
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(final_data)

    # Frequent Item sets
    itemSets = model.freqItemsets

    # Generated Association Rules
    assocRules = model.associationRules

    # Examines input items against all association rules and summarize consequents as prediction
    prediction = model.transform(final_data)

    context = {
        'all_data': json_df,
        'itemSets': itemSets,
        'assocRules': assocRules,
        'predicted': prediction
    }
    return render(request, 'show_clusters.html', context)
Example #23
    def SAR(self, transaction):
        MAX_MEMORY = "12g"
        spark = SparkSession.builder.master("local").config("spark.memory.fraction", 0.8) \
            .config("spark.executor.memory", MAX_MEMORY) \
            .config("spark.driver.memory", MAX_MEMORY).getOrCreate()

        R = Row('ID', 'items')  # use enumerate to add the ID column
        df = spark.createDataFrame([R(i, x) for i, x in enumerate(transaction)])

        fp_growth = FPGrowth(itemsCol='items', minSupport=(0.001), minConfidence=(0.001), numPartitions=100)
        df_fit = fp_growth.fit(df)

        freq = df_fit.freqItemsets.collect()
        freq_list = list(filter(lambda x: len(x[0]) > 1, freq))

        rule = df_fit.associationRules.collect()
        rule_list = list(filter(lambda x: x[3] > 1, rule))
        return rule_list, freq_list
Example #24
    def train(self):
        trainDataList, testDataList = self.doRandomSplitData(self.dbData)
        print("random split. input list size:{}, train size:{}, test size:{}".
              format(len(self.dbData), len(trainDataList), len(testDataList)))
        trainUsrItemMap = self.getItemsForUsr(trainDataList)
        testUsrItemMap = self.getItemsForUsr(testDataList)
        print('trainUsrItemMap len:' + str(len(trainUsrItemMap)) +
              ", testUsrItemMap:" + str(len(testUsrItemMap)))

        trainDf = self.spark.createDataFrame(trainUsrItemMap.items(),
                                             ["id", "items"])
        trainDf.cache()

        fpGrowth = FPGrowth(itemsCol="items",
                            minSupport=self.minSupport,
                            minConfidence=self.minConfidence)
        fgModel = fpGrowth.fit(trainDf)

        associateRules = fgModel.associationRules.collect()
        antecedentTmpList = [value['antecedent'] for value in associateRules]
        antecedentList = []
        [
            antecedentList.append(i) for i in antecedentTmpList
            if not i in antecedentList
        ]
        print('associateRules len:', len(associateRules),
              ', antecedentList len:', len(antecedentList))

        freqItemsets = fgModel.freqItemsets.collect()
        print('freqItemsets len:', len(freqItemsets))

        antecedentPredictionList = self.transformAllAntecdents(
            antecedentList, fgModel)
        print('antecedentPredictionList size:', len(antecedentPredictionList))
        usrPredictionMap = self.predictForUsers(antecedentPredictionList,
                                                trainUsrItemMap)
        print('usrPredictionMap len:' + str(len(usrPredictionMap)))

        totalTP, totalFP, totalFN = self.getTestPrecionAndRecall(
            usrPredictionMap, testUsrItemMap)
        precision = float(totalTP) / float(totalTP + totalFP)
        recall = float(totalTP) / float(totalTP + totalFN)
        print('precision:', precision, ", recall:", recall)
Example #25
def main():
    # Read from the transactions database and transactions collection, this will
    # generate a Dataframe object
    print("Reading from transactions db... \n")
    transactions_data = spark_session.read \
        .format("com.mongodb.spark.sql.DefaultSource") \
        .option("database", "transactions") \
        .option("collection", "transactions") \
        .load()
    print('Our read transactions are of the type: ', type(transactions_data),
          '\n')

    print("The generated transactions schema is: \n")
    transactions_data.printSchema()
    print("The data fetched from the dbb is: \n")
    transactions_data.show()

    product_codes = transactions_data.select("ProductCode")
    fpGrowth = FPGrowth(itemsCol="ProductCode",
                        minSupport=0.0001,
                        minConfidence=0.05)

    print('Fitting the model...')
    model = fpGrowth.fit(product_codes)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show(100)

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(transactions_data).show()

    # Simple test stuff to write to the db
    print("Writing to the mongodb")
    model.associationRules.write.format(
        "com.mongodb.spark.sql.DefaultSource") \
        .option("database", "transactions") \
        .option("collection", "recommendations") \
        .mode("append") \
        .save()
Example #26
def temp():

    from pyspark.sql import SparkSession
    from pyspark.ml.fpm import FPGrowth
    spark = SparkSession.builder.getOrCreate()

    df = spark.createDataFrame([(0, ['a', 'b', 'e']),
                                (1, ['a', 'b', 'c', 'e']), (2, ['a', 'b'])],
                               ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show()
Example #27
    def PAR(self, transaction):
        MAX_MEMORY = "14g"
        spark = SparkSession.builder.master("local").config("spark.memory.fraction", 0.8) \
            .config("spark.executor.memory", MAX_MEMORY) \
            .config("spark.driver.memory", MAX_MEMORY).getOrCreate()

        R = Row('ID', 'items')  # use enumerate to add the ID column
        df = spark.createDataFrame(
            [R(i, x) for i, x in enumerate(transaction)])

        fp_growth = FPGrowth(itemsCol='items',
                             minSupport=(0.001),
                             minConfidence=(0.001),
                             numPartitions=100)
        freq = fp_growth.fit(df).freqItemsets.collect()

        supp_x = sorted(list(filter(lambda x: len(x[0]) == 1, freq)))
        supp_xy = sorted(list(filter(lambda x: len(x[0]) == 2, freq)))
        supp_x = {k[0]: v for k, v in supp_x if k[0] != '$MISS'}
        supp_xy = list(
            filter(lambda k: k[0][0] != '$MISS' and k[0][1] != '$MISS',
                   supp_xy))

        # Rule Power Factor (RPF)
        par_result = dict()

        for i, j in supp_x.items():
            if (i != '$MISS'):
                par_result[i] = dict()
                for m, n in supp_xy:
                    if m[0] == i and m[1] != '$MISS':
                        par_result[i][m[1]] = (((n / len(transaction))**2) /
                                               (j / len(transaction)), n)
                    elif m[1] == i and m[0] != '$MISS':
                        par_result[i][m[0]] = (((n / len(transaction))**2) /
                                               (j / len(transaction)), n)

        return supp_x, {k: v for k, v in par_result.items() if len(v) > 0}
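For reference, the quantity computed above is the Rule Power Factor, RPF(x -> y) = supp(x U y) * conf(x -> y) = supp(x U y)^2 / supp(x), which is exactly the (n / len(transaction))**2 / (j / len(transaction)) expression. For example, if {x, y} occurs in 20 of 100 transactions and {x} in 40, RPF = 0.2^2 / 0.4 = 0.1, i.e. support 0.2 times confidence 0.5.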
Example #28
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the 
    first <n> association rules obtained using min support <s> and min 
    confidence <c> (parameters of the FP-Growth model), sorted by (1) 
    descending antecedent size in association rule, and (2) descending 
    confidence.

    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(name=p[0], items=p[1:]))
    df = spark.createDataFrame(rdd_data)
    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    model_1 = model.associationRules.orderBy(
        [size("antecedent"), "confidence"], ascending=[0, 0])
    model_2 = model_1.drop("lift")
    final_op = toCSVLine(model_2.limit(n))
    return final_op
Example #29
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the 
    first <n> association rules obtained using min support <s> and min 
    confidence <c> (parameters of the FP-Growth model), sorted by (1) 
    descending antecedent size in association rule, and (2) descending 
    confidence.

    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()
    result = spark.sparkContext.textFile(filename).map(lambda l: l.split(",")).zipWithIndex().map(lambda x: (x[1], x[0][0], x[0][1:]))
    df = spark.createDataFrame(result, ['id', 'plant', 'items'])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)
    model = fpGrowth.fit(df)
    result = model.associationRules

    result = result.select(size("antecedent").alias('tam'),'antecedent','consequent', 'confidence')
    result = result.sort(desc('tam'), desc('confidence')).limit(n)
    result=result.select('antecedent','consequent','confidence')

    return toCSVLine(result)
Example #30
class FPGEstimator(Estimator):
    def __init__(self, spark, user_col, item_col, grade_col, min_support,
                 min_confidence):
        self.spark = spark
        self.item_col = item_col
        self.user_col = user_col
        self.grade_col = grade_col
        self.list_item_col = self.item_col + "_list"
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.model = FPGrowth(itemsCol=self.list_item_col,
                              minSupport=self.min_support,
                              minConfidence=self.min_confidence,
                              numPartitions=1000)

    def _fit(self, transformed_df):
        train_fp_data = transformed_df.groupBy(self.user_col).agg(
            collect_set(self.item_col).alias(self.list_item_col)).select(
                self.user_col, self.list_item_col)
        # train_fp_data = train_fp_data.cache()
        fp_model = self.model.fit(train_fp_data)
        return FPGTransformer(self.spark, self.user_col, self.item_col,
                              self.list_item_col, self.grade_col,
                              fp_model.associationRules)
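The _fit step reshapes a (user, item) interaction table into one basket row per user with collect_set, which is the array-column layout FPGrowth expects for its itemsCol. A tiny illustration on invented data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_set

spark = SparkSession.builder.getOrCreate()
interactions = spark.createDataFrame(
    [("u1", "milk"), ("u1", "bread"), ("u2", "milk")], ["user", "item"])

# One row per user, with the user's distinct items collected into an array column.
baskets = interactions.groupBy("user").agg(collect_set("item").alias("item_list"))
baskets.show(truncate=False)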
Example #31
def association_rules(filename, n, s, c):
    '''
    Using the same FP-Growth algorithm, write a script that returns the 
    first <n> association rules obtained using min support <s> and min 
    confidence <c> (parameters of the FP-Growth model), sorted by (1) 
    descending antecedent size in association rule, and (2) descending 
    confidence.

    Return value: a CSV string.
    Test: tests/test_association_rules.py
    '''
    spark = init_spark()

    frame = construct()

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)

    model = fpGrowth.fit(frame)

    model = model.associationRules

    model = model.select("*", size("antecedent"))

    model = model.withColumnRenamed("size(antecedent)", "ln")

    model = model.sort(desc("ln"), desc("confidence"))

    model = model.select("antecedent", "consequent", "confidence")

    model = model.limit(n)

    string = toCSVLine(model)

    #print(string)

    return string
Example #32
def frequent_itemsets(filename, n, s, c):
    '''
    Using the FP-Growth algorithm from the ML library (see 
    http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html), 
    write a function that returns the first <n> frequent itemsets 
    obtained using min support <s> and min confidence <c> (parameters 
    of the FP-Growth model), sorted by (1) descending itemset size, and 
    (2) descending frequency. The FP-Growth model should be applied to 
    the DataFrame computed in the previous task. 
    
    Return value: a CSV string. As before, using toCSVLine may help.
    Test: tests/test_frequent_items.py
    '''
    spark = init_spark()

    frame = construct()

    fpGrowth = FPGrowth(itemsCol="items", minSupport=s, minConfidence=c)

    model = fpGrowth.fit(frame)

    model = model.freqItemsets

    model = model.select("*", size("items"))

    model = model.withColumnRenamed("size(items)", "ln")

    model = model.sort(desc("ln"), desc("freq"))

    model = model.select("items", "freq")

    model = model.limit(n)

    string = toCSVLine(model)

    return string
Example #33
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("FPGrowthExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2])
    ], ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    model.transform(df).show()
    # $example off$

    spark.stop()