Example #1
0
def event_apriori_data(event_id, support):

    support = float(support)
    data = db.events
    lists = list(data.find({'eventId': event_id}))

    if len(lists) != 0:

        df1 = pd.DataFrame(lists)
        item = []
        for i in range(len(df1.triggers[0])):
            text = df1.triggers[0][i]['data']
            item.append(text.split(','))

        te = TransactionEncoder()
        te_ary = te.fit_transform(item)
        df2 = pd.DataFrame(te_ary, columns=te.columns_)
        frq_item = apriori(df2, min_support=support, use_colnames=True)
        rule = association_rules(frq_item,
                                 metric='confidence',
                                 min_threshold=0.6)

        return rule.to_json(orient='records')

    else:
        return "No Data"
Example #2
0
    def apply_(self):
        df_ = self.frequent_patterns_prepare(min_threshold=1000)
        te = TransactionEncoder()  # TransactionEncoder-encode the dataset
        df_tf = te.fit_transform(df_.values)

        df = pd.DataFrame(df_tf, columns=te.columns_)

        start = time()
        # mine the frequent itemsets
        frequent_itemsets = fpgrowth(df, min_support=0.05, use_colnames=True)
        logging.debug('frequent-itemset mining took: %s\n' % (time() - start))
        print('frequent-itemset mining took:', time() - start)
        print()

        frequent_itemsets.sort_values(by='support',
                                      ascending=False,
                                      inplace=True)
        logging.debug(f'freqSet:\n{frequent_itemsets}\n')
        print(f'freqSet:\n{frequent_itemsets}')
        print('\n\n', '**' * 30)

        # generate the association rules
        association_rule = association_rules(frequent_itemsets,
                                             metric='confidence',
                                             min_threshold=0.7)  # metric: confidence
        association_rule.sort_values(by='leverage',
                                     ascending=False,
                                     inplace=True)  # sort the rules by leverage

        logging.debug('association rules:\n{}'.format(association_rule))
        print('association rules:\n{}'.format(association_rule))
Example #3
0
def rule():
    data = get_data()
    te = TransactionEncoder()
    te_ary = te.fit_transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric='confidence',
                              min_threshold=0.2)
    return frequent_itemsets, rules
Example #4
0
def rule():
    data = pd.read_csv('shopping_data.csv', header=None)
    df_arr = data.apply(deal, axis=1).tolist()  # deal() drops the NaN padding from each row (defined elsewhere on this page)

    te = TransactionEncoder()  # define the encoder
    te_ary = te.fit_transform(df_arr)  # encode the transactions
    df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

    return frequent_itemsets, rules  # both results are DataFrames
Example #5
0
def transaction_encoder(transaction):
    """
    读入交易记录,转化为0-1编码交易记录
    
    Parameters:
        transaction (二维list): 交易记录
        
    Returns:
        encoded_transaction (DataFrame): 0-1编码的交易记录
    """
    te = TransactionEncoder()  # 定义模型
    df_tf = te.fit_transform(transaction)
    encoded_transaction = pd.DataFrame(df_tf, columns=te.columns_)
    return encoded_transaction
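# Quick self-contained check of transaction_encoder on a toy basket list
# (illustrative data, not from the original):
baskets = [['milk', 'bread'], ['bread', 'butter'], ['milk', 'bread', 'butter']]
print(transaction_encoder(baskets))
# the columns are the sorted unique items; each row is a boolean membership vector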
Example #6
0
def fp_growth_retail(TOP_PERCENTAGE, file_name, no_of_trx):
    data = pd.read_csv('../Datasets/' + str(file_name) + '.csv', header=None)

    print("\n --- FP Growth on File " + str(file_name) +
          " : and Top Percentage: " + str(TOP_PERCENTAGE))
    # converting into required format of TransactionEncoder()
    trans = []
    for i in range(0, no_of_trx):
        trans.append([str(data.values[i, j]) for j in range(0, 20)])

    Items = dict(collections.Counter([x for sublist in trans
                                      for x in sublist]))
    Items['nan'] = 0
    print("Frequencies of Each Item:")
    print(Items)

    top_items = top_x_per_products(Items, TOP_PERCENTAGE)
    print("Top Items:")
    print(top_items)

    plot_graph(top_items, 'fp_growth', TOP_PERCENTAGE)

    Output = [b for b in trans if any(a in b for a in top_items.keys())]

    # Using TransactionEncoder
    trans = np.array(trans)

    Output = np.array(Output)
    # print(Output.shape)

    t = TransactionEncoder()
    data = t.fit_transform(Output)
    data = pd.DataFrame(data, columns=t.columns_, dtype=int)

    # print(data.shape)
    # here we also find nan as one of the columns so lets drop that column

    data.drop('nan', axis=1, inplace=True)
    # print(data.shape)
    # print(data.head())

    # running the fpgrowth algorithm
    res = fpgrowth(data, min_support=0.01, use_colnames=True)
    print("Number of Frequent Item sets:" + str(len(res)))
    res = association_rules(res, metric="confidence", min_threshold=0.5)
    print("\n=============== ASOCIATION RULES ======================")

    cols = [0, 1, 4, 5]
    res = res[res.columns[cols]]
    print(res)
Example #7
0
def rule():

    df = pd.read_csv("shopping_data.csv", header=None)
    dataset = df.stack().groupby(level=0).apply(list).tolist()

    te = TransactionEncoder()  # define the encoder
    te_ary = te.fit_transform(dataset)  # encode the dataset
    df = pd.DataFrame(te_ary, columns=te.columns_)  # wrap the array in a DataFrame

    frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=0.2)  # confidence threshold of 0.2

    return frequent_itemsets, rules
Example #8
0
def rule3():
    from mlxtend.frequent_patterns import fpgrowth
    from mlxtend.frequent_patterns import association_rules
    from mlxtend.preprocessing import TransactionEncoder
    now = time.time()
    te = TransactionEncoder()
    te_ary = te.fit_transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=0.03, use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=0.2)
    print("频繁项集:", frequent_itemsets)
    print("关联规则:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.2)])
    print("用时:", time.time() - now)
Example #9
0
def encode_orders_materials(orders):
    orders_grouped = orders[['order_id', 'material']].groupby('order_id')

    orders_materials = [
        list(orders_group.material) for (_, orders_group) in orders_grouped
    ]

    encoder = TransactionEncoder()
    orders_materials = encoder.fit_transform(orders_materials, sparse=True)

    orders_index = list(orders_grouped.groups.keys())
    orders_columns = [str(column) for column in encoder.columns_]

    return pandas.DataFrame.sparse.from_spmatrix(orders_materials,
                                                 index=orders_index,
                                                 columns=orders_columns)
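# Toy usage of encode_orders_materials (illustrative data, not from the
# original); `orders` only needs order_id and material columns.
import pandas
orders = pandas.DataFrame({
    'order_id': [1, 1, 2, 2, 2],
    'material': ['A', 'B', 'A', 'C', 'B'],
})
print(encode_orders_materials(orders))  # sparse boolean basket matrix, one row per order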
Example #10
0
    def unitfiy_sample_dataset(self):
        start = time.perf_counter()
        print("开始进一步规约样本数据集")

        shopping_df = pd.DataFrame(self.sampleList)
        df_arr = shopping_df.stack().groupby(level=0).apply(list).tolist()  # 方法一
        # df_arr = shopping_df.apply(self.deal,axis=1).tolist()		        # 方法二
        te = TransactionEncoder()  # 定义模型
        df_tf = te.fit_transform(df_arr)
        # df_01 = df_tf.astype('int')			# 将 True、False 转换为 0、1 # 官方给的其它方法
        # df_name = te.inverse_transform(df_tf)		# 将编码值再次转化为原来的商品名
        self.sample_df = pd.DataFrame(df_tf, columns=te.columns_)

        elapsed = (time.perf_counter() - start)
        print("Time used:", elapsed)
        print("样本数据集已进一步规约完毕")
Example #11
0
def rule():

    df = pd.read_csv('shopping_data.csv')
    dataset = df.stack().groupby(level=0).apply(list).tolist()

    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)

    data = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(data, min_support=0.05, use_colnames=True)

    rules = association_rules(
            frequent_itemsets, metric="confidence", min_threshold=0.2)

    return frequent_itemsets, rules
Example #12
0
def ele(date_start, date_end, asset_id, support):

    support = float(support)
    date_start = parser.parse(date_start, dayfirst=True)
    date_end = parser.parse(date_end, dayfirst=True)

    data = db.reports
    lists = data.find({
        'date': {
            '$gt': date_start,
            '$lt': date_end
        },
        'assetId': asset_id
    })
    lists = list(lists)

    if len(lists) != 0:

        df = pd.DataFrame(lists)
        data_ele = list(df.elements)

        for i in range(len(data_ele)):
            for j in range(len(data_ele[i])):
                if data_ele[i][j] is None:
                    data_ele[i][j] = "other"

        te = TransactionEncoder()

        te_ary = te.fit_transform(data_ele)
        df1 = pd.DataFrame(te_ary, columns=te.columns_)
        frq_item = apriori(df1, min_support=support, use_colnames=True)
        rule = association_rules(frq_item,
                                 metric='confidence',
                                 min_threshold=0.5)

        return rule.to_json(orient='records')

    else:
        return "No Data"
Example #13
0
def rule():
    df_data = pd.read_csv('shopping_data.csv', header=None)
    dataset = []
    for i in range(len(df_data)):
        list_data = list(df_data.loc[i])
        list_no_nan = []
        for j in range(len(list_data)):
            if isinstance(list_data[j], float):  # trailing NaN padding marks the end of the row
                break
            else:
                list_no_nan.append(list_data[j])
        dataset.append(list_no_nan)

    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)
    df_te = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df_te, min_support=0.05, use_colnames=True)
    association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
    print(frequent_itemsets)
    print(association_rules_df)
    return frequent_itemsets, association_rules_df
Example #14
0
def generate_rules(df):
    df['cat_suicides_num'] = pd.cut(df['num_of_suicides'], bins=6)
    df['cat_suicides/100k'] = pd.cut(df['suicides/100k'], bins=6)
    df['cat_population'] = pd.cut(df['population'], bins=6)
    df['cat_gdp_for_year'] = pd.cut(df['gdp_for_year'], bins=6)
    df['cat_gdp_per_capita'] = pd.cut(df['gdp_per_capita'], bins=6)

    df.drop([
        'num_of_suicides', 'population', 'suicides/100k', 'gdp_for_year',
        'gdp_per_capita'
    ],
            axis=1,
            inplace=True)

    # for i in df['continent'].unique():
    #     df_continent = df.loc[df.continent == i]
    #     df_continent = df_continent.drop(['continent'], axis=1)

    # print('start: ', datetime.now().time())
    trans = []
    for y in range(0, df.shape[0]):
        trans.append([str(df.values[y, j]) for j in range(0, df.shape[1])])

    # print('just transformed the dataset into array: ', datetime.now().time())

    te = TransactionEncoder()
    data = te.fit_transform(trans)
    data = pd.DataFrame(data, columns=te.columns_)
    print(data)

    frequent_items = apriori(data, min_support=0.5, use_colnames=True)
    print(frequent_items)

    rules = association_rules(frequent_items,
                              metric="confidence",
                              min_threshold=0.5)
    print(rules)

    # print('finished mining: ', datetime.now().time())
    rules.to_csv('generated_rules.csv')
Example #15
0
    def data_for_apriori(self):
        # rating data
        rating_df = pd.read_pickle(self.rating_data_path)

        # keep only highly rated movies (rating >= 4)
        over_4_rating = rating_df[rating_df['rating'] >= 4]
        user_movie_basket = over_4_rating.groupby('user_id')['movie_id'].apply(
            set)

        # basket -> vector
        transaction = TransactionEncoder()
        basket_array = transaction.fit_transform(user_movie_basket)

        basket_df = pd.DataFrame(basket_array, columns=transaction.columns_)

        # top 5000 movies by number of ratings
        top_5000_movie = rating_df.groupby('movie_id')['rating'].count().sort_values(ascending=False).\
            iloc[:5000].index
        top_5000_basket = basket_df[top_5000_movie]
        top_5000_basket = top_5000_basket[top_5000_basket.sum(axis=1) > 0]

        return top_5000_basket
Example #16
0
def data_transform():
    # load the data and filter on conditions
    df = pd.read_excel('./销售基础表查询.xlsx', sheet_name='销售基础表查询', header=0)
    print('raw data: {}'.format(df.shape))
    bool_content = ((df['实销数量'] > 0) & (df['实销金额'] > 0))
    df = df[bool_content]
    print('after filtering: {}'.format(df.shape))
    # select the required fields
    df2 = df[['单据号', '商品']]
    print('after field extraction: {}'.format(df2.shape))
    # aggregate: one unique receipt number -> the list of its products
    df3 = pd.DataFrame([(i, df2[df2['单据号'] == i]['商品'].tolist()) for i in df2['单据号'].unique()])
    print('after aggregation: {}'.format(df3.shape))
    # build the basket -> product-list structure
    df_arr = df3[1].tolist()
    print('basket product lists: {}'.format(len(df_arr)))
    # apply the encoder
    te = TransactionEncoder()
    df_tf = te.fit_transform(df_arr)
    # build the encoded dataset
    df4 = pd.DataFrame(df_tf, columns=te.columns_)
    print('conversion done: {}'.format(df4.shape))
    # return the data
    return df4
Example #17
0
def data_transform():
    # load the data and filter on conditions
    df = pd.read_excel('./销售基础表查询.xlsx', sheet_name='销售基础表查询', header=0)
    # print(df.head())
    # print(df.shape)
    bool_content = ((df['实销数量'] > 0) & (df['实销金额'] > 0))
    df = df[bool_content]
    # print(df.head())
    print('df: {}'.format(df.shape))
    # select the required fields
    df2 = df[['单据号', '商品']]
    print('df2: {}'.format(df2.shape))
    # aggregate: one unique receipt number -> the list of its products
    df3 = pd.DataFrame([(i, df2[df2['单据号'] == i]['商品'].tolist()) for i in df2['单据号'].unique()])
    print('df3: {}'.format(df3.shape))
    # build the basket -> product-list structure
    # shopping_lists = []
    # for shopping_list in df3[1]:
    #     shopping_lists.append(shopping_list)
    # shopping_df = pd.DataFrame(shopping_lists)
    # print(shopping_df.head(20))
    # drop the NaN padding ( apply )
    # df_arr = shopping_df.apply(deal, axis=1).tolist()
    # print('df_arr: {}'.format(len(df_arr)))
    # print(df_arr[:21])
    df_arr = df3[1].tolist()
    print('df_arr: {}'.format(len(df_arr)))
    # apply the encoder
    te = TransactionEncoder()
    df_tf = te.fit_transform(df_arr)
    # build the encoded dataset
    df4 = pd.DataFrame(df_tf, columns=te.columns_)
    print('df4: {}'.format(df4.shape))
    # print(df4.head(20))
    # return the data
    return df4
Example #18
0
import matplotlib.pyplot as plt

all_data = pd.read_csv('dataset_group.csv', header=None)
print(all_data)

unique_id = all_data[1].unique()
print(unique_id.size)

items = all_data[2].unique()
print(items.size)

dataset = [[elem for elem in all_data[all_data[1] == id][2] if elem in items]
           for id in unique_id]

te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df)

# 1
results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
print(results)

results_orig = apriori(df, min_support=0.3, use_colnames=True, max_len=1)
results_orig['length'] = results_orig['itemsets'].apply(lambda x: len(x))
print(results_orig)

results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
results_2 = results[results['length'] == 2]
Example #19
0
def test_fit_transform():
    oht = TransactionEncoder()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
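# test_fit_transform relies on module-level fixtures; illustrative stand-ins
# (not the originals from the mlxtend test suite) could look like this:
import numpy as np
dataset = [['apple', 'beer'], ['beer', 'rice']]
expect = np.array([[True, True, False],
                   [False, True, True]])  # columns: apple, beer, rice (sorted)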
    "D:\\Machine Learning_Algoritms\\Apriori\\GroceryStoreDataSet.csv",
    encoding='latin1',
    names=['products'],
    header=None)
num_records = len(Dataframe)
print(num_records)

transactions = []
for i in range(0, num_records):
    transactions.append([str(Dataframe.values[i, j]) for j in range(0, 3)])

Dataframe = list(Dataframe["products"].apply(lambda x: x.split(',')))

from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_data = te.fit_transform(Dataframe)

Dataframe = pd.DataFrame(te_data, columns=te.columns_)

count = Dataframe.loc[:, :].sum()
reverse_count = count.sort_values(0, ascending=False).head(11)
reverse_count = reverse_count.to_frame()
reverse_count = reverse_count.reset_index()
# reverse_count = reverse_count.rename(columns={"index": "items", 0: "count"})

plt.style.available

plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('dark_background')
ax = reverse_count.plot.barh()
plt.title("Popular items")
Example #21
0
import youtube_process
from mlxtend.preprocessing import TransactionEncoder
file_US = "USvideos.csv"
US_data = pd.read_csv(file_US, keep_default_na=False, low_memory=False)
US_data
df = US_data[['category_id','views']]
df
with open("US_category_id.json", 'r') as f:
    content = json.load(f)
category_map = {}
for i in content['items']:
    category_map[int(i['id'])] = i['snippet']['title']
category_map
t = df['category_id'].map(category_map)
df = pd.concat([df,t],axis=1)
df.columns=['category_id','views','category']
grade = []
for i in df['views'].values:
    # bucket view counts into grades A..E; the dict-of-booleans trick keeps at
    # most one True key, which views_map(i)[True] then selects
    views_map = lambda x: {x >= 4194399: 'A', 1823157 <= x < 4194399: 'B',
                           681861 <= x < 1823157: 'C',
                           242329 <= x < 681861: 'D', 549 <= x < 242329: 'E'}
    grade.append(views_map(i)[True])
df['views_grade'] = grade
df = df.drop(['category_id', 'views'], axis = 1)
df
def deal(data):
    return data.dropna().tolist()
df_arr = df.apply(deal, axis=1).tolist()  # convert each row to a list
TE = TransactionEncoder()  # define the encoder
df_tf = TE.fit_transform(df_arr)
df = pd.DataFrame(df_tf, columns=TE.columns_)
df
Example #22
0
shopping_list = [  # earlier baskets truncated in the original snippet
        ['莴苣', '豆奶', '尿布', '葡萄酒'],
        ['莴苣', '豆奶', '尿布', '橙汁']]
 
shopping_df = pd.DataFrame(shopping_list)


def deal(data):
    return data.dropna().tolist()
df_arr = shopping_df.apply(deal, axis=1).tolist()


"""由于mlxtend的模型只接受特定的数据格式。(TransactionEncoder类似于独热编码,每个值转换为一个唯一的bool值)"""
from mlxtend.preprocessing import TransactionEncoder	# 传入模型的数据需要满足特定的格式,可以用这种方法来转换为bool值,也可以用函数转换为0、1
 
te = TransactionEncoder()	# 定义模型
df_tf = te.fit_transform(df_arr)
# df_01 = df_tf.astype('int')			# 将 True、False 转换为 0、1 # 官方给的其它方法
# df_name = te.inverse_transform(df_tf)		# 将编码值再次转化为原来的商品名
df = pd.DataFrame(df_tf,columns=te.columns_)

"""求频繁项集:

导入apriori方法设置最小支持度min_support=0.05求频繁项集,还能选择出长度大于x的频繁项集。
"""
from mlxtend.frequent_patterns import apriori
 
frequent_itemsets = apriori(df,min_support=0.05,use_colnames=True)	# use_colnames=True表示使用元素名字,默认的False使用列名代表元素
# frequent_itemsets = apriori(df,min_support=0.05)
frequent_itemsets.sort_values(by='support',ascending=False,inplace=True)	# 频繁项集可以按支持度排序
print('求频繁项集')
print(frequent_itemsets[frequent_itemsets.itemsets.apply(lambda x: len(x)) >= 2])  # 选择长度 >=2 的频繁项集
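# Not part of the original snippet: a natural follow-up that derives rules from
# the frequent itemsets above (the 0.6 confidence threshold is illustrative):
from mlxtend.frequent_patterns import association_rules
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])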
Example #23
0
from datetime import datetime
import json
global rules
data = pd.read_csv('groceryinfo.csv', header=None)
#print(data)
records = []
for i in range(0, 7501):
    records.append([str(data.values[i, j]) for j in range(0, 20)])
removed_records = []
for row in records:
    row = list(filter(lambda a: a != 'nan', row))
    row = list(filter(lambda a: a != 'mineral water', row))
    removed_records.append(row)

te = TransactionEncoder()
data = te.fit_transform(removed_records)
data = pd.DataFrame(data, columns=te.columns_)

from mlxtend.frequent_patterns import apriori, association_rules

frq_items = apriori(data, min_support=0.004, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.2)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(len(rules))

global products
products = {
    "olive oil":
    'https://images-na.ssl-images-amazon.com/images/I/71JLJ0MQT8L._SY679_.jpg',
    "frozen vegetables":
    'https://images-na.ssl-images-amazon.com/images/I/81Dxf-0CzwL._SL1500_.jpg',
Example #24
0
# split the products into individual list form

transactions = list(items["mehsul_ad"].transform(lambda x: x.split(";")))

# In[12]:

transactions[0]

# In[13]:

# pivot the products into basket form: each row (basket) records which products are present

from mlxtend.preprocessing import TransactionEncoder

tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions),
                      columns=tr_enc.columns_)

# In[14]:

basket

# In[15]:

# the functions needed for the basket analysis

from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:

# how often the products appear in the baskets, individually and together
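# The snippet is cut off here; given the comment above, a plausible
# continuation (the support and confidence values are assumptions) would be:
frequent_items = apriori(basket, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_items, metric='confidence', min_threshold=0.5)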
Example #25
0
"""
import csv
 
dict = dkey
w = csv.writer(open("output.csv", "w"))
for key, val in dict.items():
  w.writerow([key, val])     
        
    """
    
# To create a list of lists from the dictionary values 
i=10002    
while i in range(10002,42580):
    dkey[i]=list(map(str,dkey[i]))
we=list(dkey.values())

#Fitting the association rule learning model
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
dat=we[1:50]
te_ary = te.fit_transform(dat,sparse=False)
df = pd.DataFrame(te_ary, columns=te.columns_)
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)

print (frequent_itemsets)

from mlxtend.frequent_patterns import association_rules
t = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
Example #26
0
from apyori import apriori
rules = apriori(symptoms, min_support=0.1, min_confidence=0.7)
results = list(rules)

for i in results:
    for j in i.ordered_statistics:
        X = j.items_base
        Y = j.items_add
        x = ', '.join([item for item in X])
        y = ', '.join([item for item in Y])
        if x != '':
            print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder
TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)

import pandas as pd
df = pd.DataFrame(data, columns=TE.columns_)
df.head()

from mlxtend.frequent_patterns import apriori
items = apriori(df, min_support=0.1, use_colnames=True)
print(items)

print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])


from mlxtend.frequent_patterns import association_rules
rules = association_rules(items, min_threshold=0.7)
Example #27
0
def Mlx(itemsets, minimumSup):
    te = TransactionEncoder()  # define the encoder
    df_tf = te.fit_transform(itemsets)
    df = pd.DataFrame(df_tf, columns=te.columns_)
    return apriori(df, min_support=minimumSup, use_colnames=True)
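# Example call with toy itemsets and a 50% minimum support:
itemsets = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]
print(Mlx(itemsets, 0.5))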
Example #28
0
# %%
items = [
    'whole milk', 'yogurt', 'soda', 'tropical fruit', 'shopping bags',
    'sausage', 'whipped/sour cream', 'rolls/buns', 'other vegetables',
    'root vegetables', 'pork', 'bottled water', 'pastry', 'citrus fruit',
    'canned beer', 'bottled beer'
]
np_data_new = all_data.to_numpy()
np_data_new = [[
    elem for elem in row[1:] if isinstance(elem, str) and elem in items
] for row in np_data_new]

# %%
te_new = TransactionEncoder()
te_ary_new = te_new.fit_transform(np_data_new)
data_new = pd.DataFrame(te_ary_new, columns=te_new.columns_)
data_new

# %%
fpg_result_new = fpgrowth(data_new, min_support=0.03,
                          use_colnames=True).sort_values('support',
                                                         ascending=False)
fpg_result_new

# %%
fpm_result_new = fpmax(data_new, min_support=0.03,
                       use_colnames=True).sort_values('support',
                                                      ascending=False)
fpm_result_new
Example #29
0
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import requests
file = open('/root/PycharmProjects/DATA_MINING/groceries.csv', 'w')
data = requests.get(
    'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv'
)
file.write(data.text)
file.close()
file = open('/root/PycharmProjects/DATA_MINING/groceries.csv', 'r')
lines = file.readlines()
data = [[x.rstrip('\n') for x in y.split(',')] for y in lines]
print(data)
encoder = TransactionEncoder()
data = encoder.fit_transform(data)
print(data)
data = data.astype('int')
data = pd.DataFrame(data, columns=encoder.columns_)
print(data)

print(" Minimum support 5 % and confidence 9 % ")
frq_items = apriori(data, min_support=0.05, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.09)
print(rules)

print(" Minimum support 7 % and confidence 10 % ")
frq_items = apriori(data, min_support=0.07, use_colnames=True)
rules = association_rules(frq_items, metric="confidence", min_threshold=0.10)
print(rules)
Example #30
0
def get_ferq_with_txt(txt, freq):  # header reconstructed; signature taken from test() below
    res = []
    for i in range(len(freq)):
        if freq[i] in txt:
            res.append(freq[i])
    if len(set(res)) == 0:
        return (set(freq))
    elif len(set(res)) < 15:
        for j in range(len(freq)):
            if len(set(res)) < 15:
                res.append(freq[j])
            else:
                return set(res)
        return set(res)
    else:
        return set(res)


df = pd.read_csv("dataset/sample.csv")['Tags']
df = df.apply(lambda x: x.split(" "))
data = df.values
td = TransactionEncoder()
td_data = td.fit_transform(df)
df2 = pd.DataFrame(td_data, columns=td.columns_)

freq_data = apriori(df2, min_support=0.009, use_colnames=True)  #0.0004
freq_data['len'] = freq_data.itemsets.apply(lambda x: len(x))


def test():
    df = pd.read_csv("sample.csv").head(1)
    txt = df["Title"] + " " + df["Body"]
    return get_ferq_with_txt(txt, ["linux", "c#", "php"])
Example #31
0
'''
[Plain-language machine learning] Association rules: theory and practice
https://mp.weixin.qq.com/s/KXoKE0cY7hiJIA2hE86mDw
'''
# 1. A ready-to-use dataset
data = [('牛奶', '面包', '尿布'), ('可乐', '面包', '尿布', '啤酒'), ('牛奶', '尿布', '啤酒', '鸡蛋'),
        ('面包', '牛奶', '尿布', '啤酒'), ('面包', '牛奶', '尿布', '可乐')]

# 1.1 Third-party libraries:
from mlxtend.frequent_patterns import apriori as mlxtend_apriori, association_rules as mlxtend_association_rules
from mlxtend.preprocessing import TransactionEncoder

# TransactionEncoder handles the data conversion: the `data` above first has to be
# turned into wide-table form, i.e. one boolean column per item, as shown below:
"""data conversion"""
transEn = TransactionEncoder()
oht_ary = transEn.fit_transform(data)
new_data = pd.DataFrame(oht_ary, columns=transEn.columns_)
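# For reference, new_data is the "wide table" the comment above refers to: one
# boolean column per distinct item, one row per transaction:
#
#       可乐     啤酒     尿布     牛奶     面包     鸡蛋
# 0   False  False   True   True   True  False
# 1    True   True   True  False   True  False
# 2   False   True   True   True  False   True
# 3   False   True   True   True   True  False
# 4    True  False   True   True   True  False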
# In[]:
print(new_data.iloc[0][0])
print(type(new_data.iloc[0][0]))
# In[]:
# Step 1: compute the frequent itemsets; a minimum support threshold filters them:
"""compute the frequent itemsets"""
frequent_itemset = mlxtend_apriori(new_data,
                                   min_support=0.5,
                                   use_colnames=True)
frequent_itemset
# In[]:
# Step 2: mine the association rules; the metric here can be confidence or lift
rules = mlxtend_association_rules(frequent_itemset,
                                  metric='confidence',
                                  min_threshold=0.5)  # the snippet is truncated here; the threshold value is assumed