def test_cloning():

    oht = TransactionEncoder()
    oht.fit(dataset)
    oht2 = clone(oht)

    msg = ("'TransactionEncoder' object has no attribute 'columns_'")
    assert_raises(AttributeError,
                  msg,
                  oht2.transform,
                  dataset)

    trans = oht2.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
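test_cloning above, and the test_fit / test_transform / test_transform_sparse / test_inverse_transform functions near the end of this collection, rely on module-level imports and fixtures (dataset, expect, data_sorted) that the snippets do not show. Below is a minimal, self-consistent sketch of those assumptions; the transactions here are chosen to satisfy the assertions, and the originals in mlxtend's test module may differ.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import clone
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.utils import assert_raises

# Hypothetical fixtures, consistent with the assertions in the tests.
dataset = [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

# One-hot encoding of `dataset`; the columns are the sorted item names
# ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice'].
expect = np.array([[1, 0, 1, 1, 0, 1],
                   [1, 0, 1, 0, 0, 1],
                   [1, 0, 1, 0, 0, 0],
                   [1, 1, 0, 0, 0, 0],
                   [0, 0, 1, 1, 1, 1],
                   [0, 0, 1, 0, 1, 1],
                   [0, 0, 1, 0, 1, 0],
                   [1, 1, 0, 0, 0, 0]])

# Transactions with items in column (alphabetical) order, as inverse_transform returns them.
data_sorted = [sorted(transaction) for transaction in dataset]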
Example #2
def print_frequent_itemsets(jsonfilename):
    with open(jsonfilename) as f:  # e.g. '2003_Campaign_Contributions.tsv.json'
        data = json.load(f)
    ls = []
    for col in data["columns"]:
        l = []
        for tp in col["dataTypes"]:
            dt = tp["type"]
            l.append(dt)
        ls.append(l)

    te = TransactionEncoder()
    te_ary = te.fit(ls).transform(ls)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    fi = apriori(df, min_support=0.1, use_colnames=True)
    fi['length'] = fi['itemsets'].apply(lambda x: len(x))
    for i in range(2, 4):
        print(i, "frequent itemset")
        print("------------------------------------")
        tm = fi[(fi['length'] == i) & (fi['support'] >= 0.1)]
        if tm.size == 0:
            print("No itemsets present")
        else:
            print(tm["itemsets"].to_string())
        print("------------------------------------\n")
Example #3
    def frequent_set_miner(self, T, K):
        te = TransactionEncoder()
        te_ary = te.fit(T).transform(T)
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_itemsets = apriori(df, min_support=K, use_colnames=True)
        #convert to list of list
        listofList = []
        for item in list(frequent_itemsets['itemsets']):
            list_item = list(item)
            listofList.append(list_item)
        # keep only maximal itemsets: drop any set contained in a larger one
        patterns = sorted(listofList, key=len)
        if len(patterns) > 0:
            frequent = [patterns.pop()]
        else:
            frequent = []
        while len(patterns) > 0:
            candidate = patterns.pop()
            is_maximal = True
            for f in frequent:
                if all(elem in f for elem in candidate):
                    is_maximal = False
                    break
            if is_maximal:
                frequent.append(candidate)

        return frequent
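To make the maximality filter above concrete, here is a self-contained sketch of the same pruning applied to toy itemsets (the data is invented for illustration):

itemsets = [['bread'], ['milk'], ['eggs'], ['bread', 'milk'], ['bread', 'eggs']]
patterns = sorted(itemsets, key=len)
frequent = [patterns.pop()] if patterns else []
while patterns:
    candidate = patterns.pop()
    # keep the candidate only if it is not a subset of an already-kept itemset
    if not any(all(elem in f for elem in candidate) for f in frequent):
        frequent.append(candidate)
print(frequent)  # [['bread', 'eggs'], ['bread', 'milk']]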
Example #4
def mine_rules(itemsets):
    test = itemsets.copy()
    te = TransactionEncoder()
    te_ary = te.fit(test).transform(test, sparse=True)
    df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    # note: apriori returns frequent itemsets here, not association rules
    frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)
    return frequent_itemsets
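A hypothetical call of mine_rules (the grocery transactions are invented; it assumes the pandas / mlxtend imports the snippet already relies on). Passing sparse=True and building the frame with from_spmatrix keeps the one-hot table memory-friendly when there are many distinct items:

sample_itemsets = [['bread', 'milk'],
                   ['bread', 'butter'],
                   ['milk', 'butter', 'bread']]
frequent = mine_rules(sample_itemsets)
print(frequent.head())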
Example #5
def extract(min_sup=0.4,
            i_url="./data/chat_server.csv",
            min_num=1):  # minimum support, input file path, minimum combination size

    data, raw = load_data(i_url)

    te = TransactionEncoder()

    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # extract frequently occurring keywords
    frequent_items = apriori(df, min_support=min_sup, use_colnames=True)
    frequent_items['length'] = frequent_items['itemsets'].apply(
        lambda x: len(x))

    wunch_input = []
    # print("APRIORI", frequent_items[frequent_items["length"] > min_num]['itemsets'])
    for line in frequent_items[frequent_items["length"] == 1]['itemsets']:
        if len(line) < 1:
            continue
        wunch_input.append(" ".join(line))

    return wunch_input
Example #6
def SelectSpec(path1, year, ex=float(1.0)):
    data = pd.read_excel(path1)
    data_ex = data[data['顺序'] == ex]
    data_site = set(data_ex['样地号'])
    D = {}
    Selsect_set = []
    Site_set = []
    for i in data_site:
        Site_spec = list(set(data_ex[data_ex['样地号'] == i]['物种']))
        Site_set.append(Site_spec)
    te = TransactionEncoder()
    # one-hot encode (0/1)
    te_array = te.fit(Site_set).transform(Site_set)
    df = pd.DataFrame(te_array, columns=te.columns_)
    # find frequent itemsets with apriori
    freq = apriori(df, min_support=0.5, use_colnames=True)
    Max_len = 3
    n = 0
    for item in reversed(freq['itemsets']):
        if len(item) >= Max_len:
            # Max_len = len(item)
            Spec_lis = [i for i in item]
            D[n] = []
            Selsect_set.append(Spec_lis)  # 保存所有的物种组合
            D[n].append(Spec_lis)
            n = n + 1
        else:
            break
    # return the species sets, and the plots corresponding to each set
    return Selsect_set, Site_set, data_site
Example #7
def SelectSpec(engine, year, ex=str(1.0)):
    data = pd.read_sql(str(year), con=engine)
    data_ex = data[data['顺序'] == ex]
    data_site = set(data_ex['样地号'])
    D = {}
    Selsect_set = []
    Spec_set = []
    for i in data_site:
        Site_spec = list(set(data_ex[data_ex['样地号'] == i]['物种']))
        Spec_set.append(Site_spec)
    te = TransactionEncoder()
    # one-hot encode (0/1)
    te_array = te.fit(Spec_set).transform(Spec_set)
    df = pd.DataFrame(te_array, columns=te.columns_)
    # find frequent itemsets with apriori
    freq = apriori(df, min_support=0.5, use_colnames=True)
    Max_len = 0
    n = 0
    for item in reversed(freq['itemsets']):
        if len(item) >= Max_len:
            Max_len = len(item)
            Spec_lis = [i for i in item]
            D[n] = []
            Selsect_set.append(Spec_lis)
            D[n].append(Spec_lis)
            site_temp = []
            for i in range(len(Spec_set)):
                if (set(Spec_set[i]) | item) == set(Spec_set[i]):
                    site_temp.append(list(data_site)[i])
            D[n].append(site_temp)
            n = n + 1
        else:
            break
    # print(D)
    return Selsect_set, D
Example #8
def fpg(sent):
    x = dict()
    words = []
    for i in range(len(sent)):
        #print(sent[i])
        words.append((preprocess(sent[i])))
        #print(words)

    #print(classe)
    try:
        te = TransactionEncoder()
        #patterns = pyfpgrowth. find_frequent_patterns(words, 10)
        #rules = pyfpgrowth. generate_association_rules(patterns,0.8)
        te_ary = te.fit(words).transform(words)
        words = []

        df_r = pd.DataFrame(te_ary, columns=te.columns_)
        #print(df_r)
        patterns = fpgrowth(df_r, min_support=0.1, use_colnames=True,
                            max_len=3)
    except ValueError:
        # return early so the lines below never touch an undefined name
        print('Value Error')
        return None
    print(patterns)
    return patterns
Example #9
def recommend2(request):
    dataset = []
    users = User.objects.all()
    for user in users:
        liked_movies = Movie.objects.filter(user_like=user)
        liked_movie_list = []
        for movie in liked_movies:
            liked_movie_list.append(movie.title)
        dataset.append(liked_movie_list)

    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)

    association = association_rules(frequent_itemsets,
                                    metric="confidence",
                                    min_threshold=0.3)
    associations = []
    for _, rule in association.iterrows():
        associations.append(rule)

    context = {
        'liked_movies': liked_movies,
        'associations': associations,
    }
    return render(request, 'movies/recommend2.html', context)
Example #10
def main():
    baskets = []
    infile = open("StudentsPerformance.csv", "r")
    infile.readline()
    for line in infile:
        line = line.strip('\n')
        basket = line.split(",")
        n1 = int(basket.pop())
        n2 = int(basket.pop())
        n3 = int(basket.pop())
        avg = (n1 + n2 + n3) / 3
        if avg >= 70:
            basket.append("Pass")
        else:
            basket.append("Fail")
        baskets.append(basket)

    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequentItemsets = apriori(df, min_support=0.15, use_colnames=True)
    frequentItemsets['length'] = frequentItemsets['itemsets'].apply(
        lambda x: len(x))
    frequentItemsets = frequentItemsets.sort_values(by='support',
                                                    ascending=False)
    frequentItemsets.to_csv("results.csv", encoding='utf-8', index=False)
def create_weather_df():
    attributes = []
    data = []

    data_start = False

    with open("./data/weather.nominal.arff", "r") as f:
        for line in f.readlines():
            line = line.strip()
            line = line.replace("TRUE", "windy")
            line = line.replace("FALSE", "not_windy")
            line = re.sub(r"yes$", "play", line)
            line = re.sub(r"no$", "no_play", line)

            if data_start:
                data.append(line.split(","))
                continue
            if line.startswith("@attribute"):
                attributes.append(line.split(" ")[1])
            if line.startswith("@data"):
                data_start = True

    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    df.to_csv("./data/weather.nominal.csv", index=False)
Example #12
def convert_dataset(dataset):
    status('Converting dataset to transaction...')
    data = dataset.values.tolist()
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    return df
Example #13
    def extract(self):
        transactions_for_te = []
        for doc in self.transaction_docs:
            transaction = doc["transaction"]
            transactions_for_te.append(transaction.split())

        te = TransactionEncoder()
        oht_ary = te.fit(transactions_for_te).transform(transactions_for_te,
                                                        sparse=True)
        sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary,
                                                      columns=te.columns_)

        if self.__settings_for_app_exist():
            minsup = self.__get_minsup_fraction()
        else:
            minsup = 0.1
            self.mongo.save_new_settings(self.application_label,
                                         self.collection_name_prefix, 10, 2)
        df_aspects = apriori(sparse_df,
                             min_support=minsup,
                             use_colnames=True,
                             max_len=1)

        self.aspect_docs = []
        for i in df_aspects.index:
            aspect = ' '.join(list(df_aspects.loc[i, 'itemsets']))
            self.aspect_docs.append({
                'aspect': aspect,
                'support': df_aspects.loc[i, 'support']
            })
Example #14
def apriori():
    """
    Use the Apriori algorithm to find the data's frequent itemsets, then analyze item associations.
    :return:
    """

    # imports
    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori
    # import the association-rules module
    from mlxtend.frequent_patterns import association_rules

    # define the dataset
    data_set = [['牛奶', '洋葱', '肉豆蔻', '芸豆', '鸡蛋', '酸奶'],
                ['莳萝', '洋葱', '肉豆蔻', '芸豆', '鸡蛋', '酸奶'],
                ['牛奶', '苹果', '芸豆', '鸡蛋'], ['牛奶', '独角兽', '玉米', '芸豆', '酸奶'],
                ['玉米', '洋葱', '洋葱', '芸豆', '冰淇淋', '鸡蛋']]

    te = TransactionEncoder()
    # one-hot encode
    te_ary = te.fit(data_set).transform(data_set)
    # print(type(te_ary))
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # find frequent itemsets with apriori
    freq = apriori(df, min_support=0.4, use_colnames=True)
    # compute association rules
    result = association_rules(freq, metric="confidence", min_threshold=0.6)
    # sort by confidence (assign the result: sort_values does not sort in place)
    result = result.sort_values(by='confidence', ascending=False, axis=0)
    print(result)
    result.to_excel("./result.xlsx")
    return None
def eb_set1_association(eb_subset):

    # convert the dataframe to a list
    eb_list = eb_subset.astype(str).values.tolist()

    # encode the list to true or false transaction types
    # give a name to the encoder for easy access
    TransEncode = TransactionEncoder()
    # encode the list
    TE_arr = TransEncode.fit(eb_list).transform(eb_list)
    # change the list to a dataframe
    eb_df = pd.DataFrame(TE_arr, columns=TransEncode.columns_)

    # define a list of min support
    sup_list = [0.03, 0.05, 0.08]
    # apply apriori
    for sup in sup_list:
        freq_sets = apriori(eb_df, min_support=sup, use_colnames=True)
        freq_sets['Length'] = freq_sets['itemsets'].apply(lambda x: len(x))
        # get the most frequent itemset
        if sup == max(sup_list):
            most_freq_sets = freq_sets[(freq_sets['Length'] >= 2)
                                       & (freq_sets['support'] >= 0.09)]
            # print out the result
            print('********************')
            print('Most Frequent Set for Eventbrite Category')
            print(most_freq_sets['itemsets'])
            print('Support of most frequent set is ')
            print(most_freq_sets['support'])
Example #16
def calc_fpgrowth(df, element, min_support):
    # one-hot encoding
    te = TransactionEncoder()
    te_ary = te.fit(df).transform(df)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    print(df.head())
    # fpgrowth
    print("get frequent set by min support =", min_support / 100)
    frequent_itemsets = fpgrowth(df,
                                 min_support=min_support / 100,
                                 use_colnames=True,
                                 verbose=1)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    frequent_itemsets['count'] = len(df) * frequent_itemsets['support']
    frequent_itemsets['count'] = frequent_itemsets['count'].apply(np.ceil)
    frequent_itemsets['count'] = frequent_itemsets['count'].astype('int')
    frequent_itemsets.sort_values(by=['support', 'length'],
                                  ascending=False,
                                  inplace=True)
    print(frequent_itemsets.head())
    # association rule
    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=0.01)
    rules['total_set'] = [
        frozenset.union(*X)
        for X in rules[['antecedents', 'consequents']].values
    ]
    #rules=rules[rules["consequents"]==frozenset(element)]
    rules = rules[~rules['consequents'].
                  apply(lambda x: x.isdisjoint(frozenset(element)))]
    #rules=rules[~rules["antecedents"].apply(lambda x : x.isdisjoint(frozenset(keyword)))]
    rules.sort_values(by=['confidence', 'antecedent support'],
                      ascending=False,
                      inplace=True)  # support = (co-occurrence count) / (total count)
    rules['count'] = len(df) * rules['support']
    rules['support'] = 100 * rules['support']
    rules['confidence'] = 100 * rules['confidence']
    rules['count'] = rules['count'].apply(np.ceil)
    rules['count'] = rules['count'].astype('int')
    rules['support'] = rules['support'].round(2)
    rules = rules.loc[:, [
        'antecedents', 'consequents', 'support', 'count', 'confidence',
        'total_set'
    ]]
    rules.columns = [
        '연관약품코드(전)', '연관약품코드(후)', '지지도(%)', '출현빈도', '연관도(%)', 'total_set'
    ]
    frequent_itemsets["total_set"] = frequent_itemsets["itemsets"]
    frequent_itemsets['support'] = frequent_itemsets['support'] * 100
    frequent_itemsets['support'] = frequent_itemsets['support'].round(2)
    frequent_itemsets = frequent_itemsets.loc[:, [
        'itemsets', 'support', 'count', 'length', 'total_set'
    ]]
    frequent_itemsets.columns = ['출현집합', '지지도(%)', '출현빈도', '품목개수', 'total_set']
    frequent_itemsets = frequent_itemsets.reset_index(drop=True)
    rules = rules.reset_index(drop=True)
    return frequent_itemsets, rules
Example #17
def createFreqItems(data):
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=0.001, use_colnames=True, max_len=5)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    return frequent_itemsets
def convert_to_matrix(db):
    transactions = []
    for i, row in db.iterrows():
        transactions.append(row["itemsets"])
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    basket_sets = pd.DataFrame(te_ary, columns=te.columns_)
    return basket_sets
    def get_dataset(self, ratings):
        transactions = [[movie_id for _, movie_id, _ in movies_ids]
                        for user_id, movies_ids in
                        groupby(ratings, key=itemgetter(0))]
        transaction_encoder = TransactionEncoder()
        one_hot = transaction_encoder.fit(transactions).transform(
            transactions)
        return pd.DataFrame(one_hot, columns=transaction_encoder.columns_)
Example #20
    def return_support(self):
        print('Calculating support values...')
        te = TransactionEncoder()
        te_array = te.fit(self.list_sku).transform(self.list_sku)
        df_list = pd.DataFrame(te_array, columns=te.columns_)
        support = apriori(df_list, min_support=0.05, use_colnames=True)
        support = support.sort_values(['support'], ascending=False)
        support['length'] = support.apply(lambda x: len(x['itemsets']), axis=1)
        return support
Example #21
def applyApriori(dataset, support):
    rec = dataset.values.tolist()
    te = TransactionEncoder()
    te_ary = te.fit(rec).transform(rec)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    from mlxtend.frequent_patterns import apriori
    freq_Itemsets = apriori(df, min_support=support, use_colnames=True)
    freq_Itemsets['length'] = freq_Itemsets['itemsets'].apply(lambda x: len(x))
    return (freq_Itemsets[freq_Itemsets['length'] > 1])
def get_frequent_set(total_i_list, appointed_output, my_refrigerator):
    def set_giver(sort_set_list: list):
        recommended_fequent_set = []

        for r_num in range(1, 5):
            recommended_name = []
            for num, i in enumerate(sort_set_list):
                if r_num == 1:
                    # the if conditions are written negated: anything that fails falls through to continue
                    if not ((appointed_output.issubset(i)) and (i.issubset(my_refrigerator))):
                        continue
                elif r_num == 2:
                    if not ((appointed_output.issubset(i)) and (round(sort_set.iloc[num, 0], 3) > 0.3)):
                        continue
                elif r_num == 3:
                    if not i.issubset(my_refrigerator):
                        continue
                else:
                    pass

                # only sets that pass the checks above get appended
                '''
                i: the associated itemset
                i & my_refrigerator - appointed_output: ingredients in the match (from the refrigerator) beyond the requested ones
                i - my_refrigerator: items in the match that the user does not have
                appointed_output - i: requested ingredients not covered by the match
                '''
                set_info = [sort_set.index[num], round(sort_set.iloc[num, 0], 3), i,
                            i & my_refrigerator - appointed_output, i - my_refrigerator, appointed_output - i,r_num]
                recommended_fequent_set.append(set_info)
                recommended_name.append(i)

                # return as soon as 5 sets have been collected
                if len(recommended_fequent_set) == 5:
                    return recommended_fequent_set

            for selected_set in recommended_name:
                sort_set_list.remove(selected_set)

        # return whatever has been found, however many sets that is
        return recommended_fequent_set

    te = TransactionEncoder()
    te_ary = te.fit(total_i_list).transform(total_i_list)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    if frequent_itemsets["itemsets"].count() < 2:
        frequent_itemsets = apriori(df, min_support=1/(df.count()[0]-1), use_colnames=True)
    # print(f"{frequent_itemsets.count()}")
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    output_set = frequent_itemsets[(frequent_itemsets['length'] >= len(appointed_output) + 1)]
    # print(output_set.count())
    sort_set = output_set.sort_values(['support'], ascending=False)

    recommended_fequent_set = set_giver(sort_set['itemsets'].tolist())

    return recommended_fequent_set
Example #23
def get_df_items(itemsets):
    '''Creates a one-hot encoded dataframe of the given itemsets'''
    transaction_encoder = TransactionEncoder()
    transaction_encoded_ary = transaction_encoder.fit(itemsets).transform(
        itemsets)
    #Dataframe
    df = pd.DataFrame(transaction_encoded_ary,
                      columns=transaction_encoder.columns_)
    return df
Example #24
def apply_apriori(file, mins):
    df = read(file)
    te = TransactionEncoder()
    te_ary = te.fit(df).transform(df)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    x = apriori(df, min_support=mins, use_colnames=True)

    print(x['itemsets'])
def solution():
    data = pd.read_csv('res/Retail.csv')
    #print(data.head(10))
    print('Total data shape:', data.shape)
    print('Unscanned Items shape:',
          data[data['Dept'] == '0999:UNSCANNED ITEMS'].shape)
    data.drop(data.loc[data['Dept'] == '0999:UNSCANNED ITEMS'].index,
              inplace=True)
    print('Data shape after dropping unscanned items:', data.shape)
    res1_candy = (data[data['Dept'] == '0973:CANDY'].shape)[0]
    print("number of times ‘0973:CANDY’ sold:", res1_candy)

    #df = data.groupby('POS Txn')
    #print(dataset.head())

    transaction_list = []
    for i in data['POS Txn'].unique():
        tlist = list(set(data[data['POS Txn'] == i]['Dept']))
        if len(tlist) > 0:
            transaction_list.append(tlist)

    te = TransactionEncoder()
    te_ary = te.fit(transaction_list).transform(transaction_list)
    df2 = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df2, min_support=0.02, use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric='lift',
                              min_threshold=2)
    sup_df = rules.sort_values('support', ascending=False).reset_index()
    res2_maxsupport = round(sup_df['support'][0], 5)
    #print(sup_df.iloc[:5,:6])
    #print(sup_df.iloc[:-5,:6])
    print(res2_maxsupport)

    print('Rules shape:', rules.shape)
    res3_totrules = rules.shape[0]
    print(res3_totrules)

    fildf = rules[(rules['lift'] >= 3) & (rules['confidence'] >= 0.1)]
    print('Filtered Rules shape:', fildf.shape)
    res4_filrules = fildf.shape[0]
    print(res4_filrules)

    #print(rules)

    # Creating a list of the answer
    result = [res1_candy, res2_maxsupport, res3_totrules, res4_filrules]
    print('Final Result:', result)
    # NOTE: Here 100, 0.54321, 40, 20 are the answer of 1st, 2nd, 3rd and 4th question respectively. Change it accordingly.

    # Finally create a dataframe of the final output  and write the output to output.csv

    result = pd.DataFrame(result)
    # writing output to output.csv
    result.to_csv('output/output.csv', header=False, index=False)
Example #26
def question_1():
    # (a)
    print("----Part a----")
    df = pd.read_csv('Groceries.csv')
    # count unique items in each customer
    data = df.groupby(['Customer'])['Item'].count()
    print(len(data))
    # histogram
    plt.hist(data)
    plt.xlabel('Unique Items')
    plt.ylabel('Count')
    plt.title("Histogram of unique items")
    plt.show()
    # get 25,50,75 percentile
    quarts = np.percentile(data, [25, 50, 75])
    print(f"25%: {quarts[0]}, 50%: {quarts[1]}, 75%: {quarts[2]}")
    
    # (b)
    print("----Part b----")
    # group by items, and make it into a list of lists
    data_items = df.groupby(['Customer'])['Item'].apply(list).values.tolist()
    # apriori alg
    te = TransactionEncoder()
    te_ary = te.fit(data_items).transform(data_items)
    item_indicators = pd.DataFrame(te_ary, columns = te.columns_)
    frequent_item_sets = apriori(item_indicators, 
                                 min_support = 75 / len(data_items),
                                 use_colnames = True, max_len = None) # how to determine max_len?
    total_item_sets = len(frequent_item_sets)
    print(f"{total_item_sets} itemsets")
    largest_k = len(frequent_item_sets['itemsets'][total_item_sets - 1])
    print(f"Largest k: {largest_k}")
    
    # (c)
    print("----Part c----")
    ass_rules = association_rules(frequent_item_sets, metric = "confidence",
                                  min_threshold = 0.01)
    print(f"{len(ass_rules)} Association rules")
    
    # (d)
    print("----Part d----")
    plt.scatter(ass_rules['confidence'], ass_rules['support'], 
                c = ass_rules['lift'], s = ass_rules['lift'])
    plt.xlabel("Confidence")
    plt.ylabel("Support")
    plt.title("Support vs Confidence")
    color_bar = plt.colorbar()
    color_bar.set_label("Lift")
    plt.show()
    print("Just a graph for this part")
    
    # (e)
    print("----Part e----")
    ass_rules_e = association_rules(frequent_item_sets, metric = "confidence",
                                    min_threshold = 0.6)
    print(ass_rules_e.to_string())
Example #27
def associationRule(df):
    """
    :param df: Input the Gross dataframe - part2cleanedGrosses.csv
    :return: Print out and save the itemsets for three different support values and calculate the confidence.
    """

    # Generating the itemsets
    df['week_ending'] = pd.to_datetime(
        df['week_ending']).dt.strftime('%Y-%m-%d')
    gross_date = [
        x for x in df['week_ending'].unique() if int(x[0:4]) > 2000
    ]
    all_date = df['week_ending'].unique()

    itemset = []
    for i in all_date:
        temp = list(
            df.loc[(df['week_ending'] == i) & (df['percent_of_cap'] >= 0.8),
                   'show'])
        itemset.append(temp)

    itemset2 = []
    for i in gross_date:
        temp = list(
            df.loc[(df['week_ending'] == i) & (df['percent_of_cap'] >= 0.8),
                   'show'])
        itemset2.append(temp)

    # Perform Apriori algorithm
    for j in range(0, 2):
        temp = [itemset, itemset2][j]
        te = TransactionEncoder()
        te_ary = te.fit(temp).transform(temp)
        temp_df = pd.DataFrame(te_ary, columns=te.columns_)
        value = [0.4, 0.6, 0.8]  # support value

        confidf = pd.DataFrame()
        supportdf = pd.DataFrame()
        for i in value:
            frequent_itemsets = apriori(temp_df,
                                        min_support=i,
                                        use_colnames=True)
            confi = association_rules(frequent_itemsets,
                                      metric="confidence",
                                      min_threshold=0.7)
            frequent_itemsets['support_val'] = i
            confi['support_val'] = i
            supportdf = pd.concat([supportdf, frequent_itemsets])
            confidf = pd.concat([confidf, confi.iloc[:, [0, 1, 5, -1]]])
            print('#################### Support =', i, '####################')
            pprint.pprint(frequent_itemsets)
            print('####### Calculating the Confidence #######')
            print(confi.iloc[:, [0, 1, 5, -1]])
        supportdf.to_csv(str('Itemset_support' + str(j) + '.csv'), index=False)
        confidf.to_csv(str('Itemset_confidence' + str(j) + '.csv'),
                       index=False)
Example #28
def supportCount(item_sets):
    te = TransactionEncoder()
    te_ary = te.fit(item_sets).transform(item_sets)
    te_ary = te_ary.astype("int")
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

    print('{Itemsets} ----> support')
    for index, row in frequent_itemsets.iterrows():
        print(set(row['itemsets']), '----->', round(row['support'], 3))
def getBooleanDF(property_list):
    """
    Transform the nested list into a boolean dataframe with transactions on rows and items on columns
    :param property_list: The nested list with the wikidata properties
    :return: A boolean dataframe
    """
    te = TransactionEncoder()
    te_ary = te.fit(property_list).transform(property_list)
    boolean_dataframe = pd.DataFrame(te_ary, columns=te.columns_)
    return boolean_dataframe
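A hypothetical call of getBooleanDF (the Wikidata property lists are invented for illustration; it assumes the pandas / TransactionEncoder imports the snippet already relies on):

properties = [['P31', 'P569', 'P21'],
              ['P31', 'P21'],
              ['P31', 'P569']]
bool_df = getBooleanDF(properties)
print(bool_df)
# roughly:
#      P21   P31   P569
# 0   True  True   True
# 1   True  True  False
# 2  False  True   True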
Example #30
    def applyAprioriTopic(self, support: float) -> pd.DataFrame:
        processor = TransactionEncoder()
        binary = processor.fit(self.rawTopic).transform(self.rawTopic)
        df = pd.DataFrame(binary, columns=processor.columns_)
        frequent = apriori(df, min_support=support, use_colnames=True,
                           low_memory=True)
        return association_rules(frequent, metric='confidence',
                                 min_threshold=0.8)
Example #31
    def unitfiy_sample_dataset(self):
        start = time.perf_counter()
        print("开始进一步规约样本数据集")

        te = TransactionEncoder()  # set up the encoder
        te_ary = te.fit(self.sampleList).transform(self.sampleList)
        self.sample_df = pd.DataFrame(te_ary, columns=te.columns_)

        elapsed = (time.perf_counter() - start)
        print("Time used:", elapsed)
        print("样本数据集已进一步规约完毕")
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# In[2]:


fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]


# In[3]:


te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
# pd.SparseDataFrame has been removed from pandas; build a sparse frame via the accessor
sparse_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
sparse_df


# In[4]:


frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5


# In[5]:


frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
def test_fit():
    oht = TransactionEncoder()
    oht.fit(dataset)
    assert(oht.columns_ == ['Apple', 'Bananas', 'Beer',
                            'Chicken', 'Milk', 'Rice'])
def test_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert(isinstance(trans, csr_matrix))
    np.testing.assert_array_equal(expect, trans.todense())
def test_inverse_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))