Example #1
import pandas as pd  # Import the required libraries
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

dataset = []
with open('./mart.csv', 'r') as reader:
    for line in reader:
        dataset.append(line.strip().split(','))
te = TransactionEncoder()  # Encode the transactions as a one-hot (bag-of-words style) array
te_ary = te.fit(dataset).transform(dataset)

# Store the encoded transactions in a pandas DataFrame
df = pd.DataFrame(te_ary, columns=te.columns_)

# Find meaningful rules with the Apriori algorithm
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets,
                          metric="confidence",
                          min_threshold=0.01)
print(rules)  # Show the rules
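
For quick reference, a minimal self-contained sketch of the same pipeline, assuming mlxtend is installed; the inline transactions below are invented for illustration and only the mlxtend calls mirror the example above:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# hypothetical transactions, one list of items per basket
transactions = [['milk', 'bread'], ['milk', 'butter'], ['milk', 'bread', 'butter']]

te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)

itemsets = apriori(onehot, min_support=0.5, use_colnames=True)
rules = association_rules(itemsets, metric="confidence", min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])
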
Example #2
# Read the file
dataset = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
# Inspect the dimensions; the shape is (7501, 20)
print(dataset)

# Store the rows in the transactions list
transactions = []
for i in range(0, dataset.shape[0]):
    temp = []
    for j in range(0, 20):
        if str(dataset.values[i, j]) != 'nan':
            temp.append(str(dataset.values[i, j]))
    transactions.append(temp)

# One-hot encode the transactions
temp = TransactionEncoder()
temp_hot_encoded = temp.fit_transform(transactions)
df = pd.DataFrame(temp_hot_encoded, columns=temp.columns_)
print(df.head())
df.to_csv('df.csv')

# Mine frequent itemsets
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support',
                                                  ascending=False)
print('Frequent itemsets:', frequent_itemsets)
# Compute association rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules = rules.sort_values(by='lift', ascending=False)
print('Association rules:', rules)
rules.to_csv('rules.csv')
Example #3
def arms(orders, recipes, dept, products):

    print(
        'Association rule mining is used to find the most bought products and their relationship'
    )
    dept_of_interest = dept.query(
        'department == "produce" or department == "bakery" or department == "international" or department == "beverages" or department == "dry goods pasta" or department == "bulk" or department == "meat seafood" or department == "pantry" or department == "dairy eggs"'
    )  # fetch the departments of interest

    #products = pd.read_csv(xls, 'products') # reading products file

    #fetching the 'Produce Department' id

    val = dept_of_interest[
        'department_id']  # val stores the department ids of the selected departments

    dept_of_interest  # shows department id

    filtered_products = products.loc[products['department_id'].isin(
        val)]  # get products belonging to the selected departments

    filtered_products = pd.Series(filtered_products['product_id']
                                  )  # keep the product_id column as a Series

    filtered_products_list = filtered_products.tolist(
    )  # convert the Series to a list

    # convert all the values to strings
    filtered_products_strlist = []
    for elem in filtered_products_list:
        filtered_products_strlist.append(str(elem))

    # convert the values in the product_id column to strings
    orders['product_id'] = orders['product_id'].astype(str)

    # printing orders
    orders

    filtered_orders = orders.loc[orders['product_id'].isin(
        filtered_products_strlist
    )]  #fetching orders which contain products from the filtered products list

    del filtered_orders[
        'add_to_cart_order']  # removing column add_to_cart_order

    del filtered_orders['reordered']  # removing reordered column

    product_id_name_map = products.loc[products['product_id'].isin(
        filtered_products_strlist)]

    product_id_name_map

    # get product rows where 'product_id' matches filtered_products_strlist

    # convert the values to strings
    product_id_name_map['product_id'] = product_id_name_map[
        'product_id'].apply(str)
    product_id_name_map['product_id']

    filtered_orders['product_id'] = filtered_orders['product_id'].apply(str)

    # merge the filtered_orders table and the product_id_name_map table

    with pd.option_context('mode.chained_assignment', None):

        final_table = pd.merge(filtered_orders,
                               product_id_name_map,
                               on='product_id',
                               how='left')

    # final_table maps order_id to product_id, product_name, aisle_id and department_id
    final_table

    # group order id by order id and product name
    matrix = final_table.groupby(
        ['order_id'])['product_name'].apply(list).values.tolist()
    matrix

    # # Transform Dataframe

    #Change variables from here
    # create a True/False matrix with one row per transaction

    te = TransactionEncoder()
    te_ary = te.fit(matrix).transform(matrix)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    df

    # # Find products that are bought more than five percent of the time

    support = apriori(df, min_support=0.05, use_colnames=True)
    print(
        'Products that are bought more than Five percent of the time (Support)'
    )
    print('')
    display(support)
    print('')

    # Below items should always be kept in stock

    # #  Find products that are bought one percent of the time

    print(
        'Products that are bought more than One percent of the time (Support) '
    )
    print('')
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    display(frequent_itemsets)

    # # Products selected with more than one percent support and twenty percent confidence

    # Use the itemsets above (products bought in at least one percent of orders) and select
    # products that are bought in pairs, i.e. if one product is bought the other is bought as well.
    # Only report pairs bought together more than 20 percent of the time, i.e. rules with a
    # confidence above 20 percent.
    print(
        'Products that are bought more than Twenty percent of the times together (Confidence)'
    )
    print('')
    products_association = association_rules(frequent_itemsets,
                                             metric="confidence",
                                             min_threshold=0.20)
    display(products_association)
    print('')

    # #  Products filtered from the above result with more than 2.5 lift

    # Lift is support(A and B together) / (support(A) * support(B))
    # We keep rules with lift greater than 2.5, since that means A and B are bought together
    # about 2.5 times more often than would be expected if they were independent
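    # A worked check with hypothetical numbers: if support(A and B) = 0.04, support(A) = 0.10
    # and support(B) = 0.12, then lift = 0.04 / (0.10 * 0.12) = 3.33, so the pair co-occurs
    # about 3.3 times more often than independence would predict and passes the 2.5 cut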

    print(
        'Product which are more than Two and a half times likely to be bought together (Lift) '
    )
    print('')
    products_association = products_association[
        products_association['lift'] >= 2.5]
    display(products_association)
Example #4
 def transform(self):
     te = TransactionEncoder()
     te_ary = te.fit(self.df).transform(self.df)
     self.df = pd.DataFrame(te_ary, columns=te.columns_)
     return self.df
Example #5
def apriori():
    """
    接口请求参数:{
        "table_name": "apriori_test",  # str,数据库表名
        "X": ["x0", "x1", "x2", "x3", "x4", "x5"],  # list,自变量
        "alg": "fpgrowth',  # str,关联规则算法选择["apriori", "fpgrowth"] ==》【默认值:fpgrowth】
        "dataconvert": True,  # bool,是否需要数据转换 ==》【默认值:True】
        "minSupport": "0.05",  # str,最小支持度 ==》【默认值:"0.05"】
        "max_len": "2",  # 频繁项集最大长度 ==》【默认值:None】
        "metrics": "confidence",  # 关联规则评价指标["support", "confidence", "lift", "leverage", "conviction"] ==》【默认值:confidence】
        "min_threshold": "0.8",  # 关联规则评价指标最小值 ==》【默认值:"0.8"】
    }
    :return:
    """
    log.info('Apriori_init...')
    request_data = init_route()
    try:
        from mlxtend.preprocessing import TransactionEncoder
        from mlxtend.frequent_patterns import apriori
        from mlxtend.frequent_patterns import fpgrowth
        from mlxtend.frequent_patterns import association_rules
    except ImportError:
        raise ImportError("cannot import mlxtend")
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        alg = request_data['alg']
        dataconvert = request_data['dataconvert']
        min_support = float(request_data['minSupport'])
        max_len = int(request_data['max_len'])
        metrics = request_data['metrics']
        min_threshold = float(request_data['min_threshold'])
    except Exception as e:
        log.info(e)
        raise e
    try:
        table_data = exec_sql(table_name, X)
        table_data.fillna("", inplace=True)
        data = table_data.values.tolist()
        if dataconvert:
            trans = TransactionEncoder()
            data = trans.fit(data).transform(data)
            data = pd.DataFrame(data, columns=trans.columns_)
            log.info("data columns:{}".format(data.columns.values))
            if "" in data.columns:
                data.drop(columns="", axis=1, inplace=True)
        if alg == "apriori":
            frequent_itemsets = apriori(data, min_support=min_support, max_len=max_len, use_colnames=True)
        elif alg == "fpgrowth":
            frequent_itemsets = fpgrowth(data, min_support=min_support, max_len=max_len, use_colnames=True)
        else:
            raise ValueError("input Association rules:{} is not support".format(alg))
        rules = association_rules(frequent_itemsets, metric=metrics, min_threshold=min_threshold)
        rules = rules.replace([np.inf, -np.inf], "")
        rules = format_dataframe(rules, {"lift": ".4f", "leverage": ".4f"})
        res = [
            transform_table_data_to_html({
                "title": "频繁项集结果",
                "row": frequent_itemsets.index.tolist(),
                "col": frequent_itemsets.columns.tolist(),
                "data": frequent_itemsets.values.tolist(),
            }),
            transform_table_data_to_html({
                "title": "关联规则结果",
                "row": rules.index.tolist(),
                "col": rules.columns.tolist(),
                "data": rules.values.tolist(),
            })
        ]
        response_data = {"res": res,
                         "code": "200",
                         "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.exception(e)
        return jsonify({"code": "500", "res": "", "msg": "{}".format(e.args)})
Example #6
def main(inputs,
         infile,
         outfile,
         min_support=0.5,
         min_confidence=0.5,
         min_lift=1.0,
         min_conviction=1.0,
         max_length=None):
    """
    Parameters
    ----------
    inputs : str
        File path to the Galaxy tool parameter file

    infile : str
        File paths of input vector

    outfile : str
        File path to output matrix

    min_support: float
        Minimum support

    min_confidence: float
        Minimum confidence

    min_lift: float
        Minimum lift

    min_conviction: float
        Minimum conviction

    max_length: int
        Maximum length

    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    input_header = params['header0']
    header = 'infer' if input_header else None

    with open(infile) as fp:
        lines = fp.read().splitlines()

    if header is not None:
        lines = lines[1:]

    dataset = []
    for line in lines:
        line_items = line.split("\t")
        dataset.append(line_items)

    # TransactionEncoder learns the unique labels in the dataset and transforms the
    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)

    # Turn the encoded NumPy array into a DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Extract frequent itemsets for association rule mining
    # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
    frequent_itemsets = fpgrowth(df,
                                 min_support=min_support,
                                 use_colnames=True,
                                 max_len=max_length)

    # Get association rules, with confidence larger than min_confidence
    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=min_confidence)

    # Filter association rules, keeping rules with lift and conviction at least min_lift and min_conviction
    rules = rules[(rules['lift'] >= min_lift)
                  & (rules['conviction'] >= min_conviction)]

    # Convert columns from frozenset to list (more readable)
    rules['antecedents'] = rules['antecedents'].apply(list)
    rules['consequents'] = rules['consequents'].apply(list)

    # The next 3 steps are intended to fix the order of the association
    # rules generated, so tests that rely on diff'ing a desired output
    # with an expected output can pass

    # 1) Sort entry in every row/column for columns 'antecedents' and 'consequents'
    rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
    rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))

    # 2) Create two temporary string columns to sort on
    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))

    # 3) Sort results so they are reproducible
    rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
    del rules['ant_str']
    del rules['con_str']
    rules.reset_index(drop=True, inplace=True)

    # Write association rules and metrics to file
    rules.to_csv(outfile, sep="\t", index=False)
Example #7
def apriori_gen(dataset, min_support):
    encoder = TransactionEncoder()
    te_ary = encoder.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=encoder.columns_)

    apriori(df, min_support=min_support, use_colnames=True).to_csv("test.csv")
Example #8
def solve_3_2():
    global users_gamedicts, gamesofallusers

    gamesofallusers = [
        list(gamedict.keys()) for fid, gamedict in users_gamedicts.items()
    ]
    # TODO: Convert the gamedict to a list of lists

    # Remove common Steam entries that are not games:
    non_games = [
        'Dota 2 Test',
        'True Sight',
        'True Sight: Episode 1',
        'True Sight: Episode 2',
        'True Sight: Episode 3',
        'True Sight: The Kiev Major Grand Finals',
        'True Sight: The International 2017',
        'True Sight: The International 2018 Finals',
    ]
    for user_games in gamesofallusers:
        for entry in non_games:
            if entry in user_games:
                user_games.remove(entry)

    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori

    te = TransactionEncoder()
    # TODO: Tinker around with the values
    te_ary = te.fit(gamesofallusers).transform(gamesofallusers)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
    frequent_itemsets.sort_values(by='support',
                                  ascending=False,
                                  ignore_index=True,
                                  inplace=True)

    import numpy as np
    from mlxtend.frequent_patterns import association_rules

    thresholds = np.arange(0.3, 1.0001, 0.01).tolist()
    for f in thresholds:
        f = round(f, 2)
        filtered_frequent_itemsets = frequent_itemsets[
            frequent_itemsets.support >= f]
        if len(filtered_frequent_itemsets.index) > 0:
            for t in thresholds:
                t = round(t, 2)
                conf_rules = association_rules(filtered_frequent_itemsets,
                                               metric="confidence",
                                               min_threshold=t)
                lift_rules = association_rules(filtered_frequent_itemsets,
                                               metric="lift",
                                               min_threshold=t)

                print(
                    f'f={f}, t={t}: {len(filtered_frequent_itemsets.index)} | {len(conf_rules.index)} | {len(lift_rules.index)}'
                )
        if len(filtered_frequent_itemsets.index) <= 5:
            break

    # TODO: Play around with the threshold value
    # conf_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.75)
    # lift_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.75)
    pass
Example #9
def recommend():

    # Import modules.
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from mlxtend.preprocessing import TransactionEncoder


    # Load orders dataset.
    orders = pd.read_csv(r'./input_original_datasets/olist_order_items_dataset.csv')

    products = pd.read_csv(r'./input_original_datasets/olist_products_dataset.csv')

    # Load translations dataset.
    translations = pd.read_csv(r'./input_original_datasets/product_category_name_translation.csv')

    # Print orders header.
    orders.head()

    # Print orders info.
    orders.info()


    # Print products header.
    products.head()


    # Print products info.
    products.info()


    # Print translations header.
    translations.head()


    # Print translations info.
    translations.info()


    # Translate product names to English.
    products = products.merge(translations, on='product_category_name', how="left")

    # Print English names.
    products['product_category_name_english']


    # # Convert product IDs to product category names.


    # Define product category name in orders DataFrame.
    orders = orders.merge(products[['product_id','product_category_name_english']], on='product_id', how='left')

    # Print orders header.
    orders.head()


    # Drop products without a defined category.
    orders.dropna(inplace=True, subset=['product_category_name_english'])
    # Print number of unique items.
    len(orders['product_id'].unique())

    # Print number of unique categories.
    len(orders['product_category_name_english'].unique())


    # # Construct transactions from order and product data

    # Identify transactions associated with example order.
    example1 = orders[orders['order_id'] == 'fe64170e936bc5f6a6a41def260984b9']['product_category_name_english']

    # Print example.
    example1


    # Identify transactions associated with example order.
    example2 = orders[orders['order_id'] == 'fffb9224b6fc7c43ebb0904318b10b5f']['product_category_name_english']

    # Print example.
    example2


    # # Map orders to transactions.
    #
    #

    # Recover transaction itemsets from orders DataFrame.
    transactions = orders.groupby("order_id").product_category_name_english.unique()

    # Print transactions header.
    transactions.head()


    # Plot 50 largest categories of transactions.
    transactions.value_counts()[:50].plot(kind='bar', figsize=(15,5))


    # Convert the pandas series to list of lists.
    transactions = transactions.tolist()

    # Print length of transactions.
    len(transactions)


    # Count number of unique item categories for each transaction.
    counts = [len(transaction) for transaction in transactions]
    # Print median number of items in a transaction.
    np.median(counts)


    # Print maximum number of items in a transaction.
    np.max(counts)


    # # Association Rules and Metrics


    from mlxtend.preprocessing import TransactionEncoder

    # Instantiate an encoder.
    encoder = TransactionEncoder()

    # Fit encoder to list of lists.
    encoder.fit(transactions)

    # Transform lists into one-hot encoded array.
    onehot = encoder.transform(transactions)

    # Convert array to pandas DataFrame.
    onehot = pd.DataFrame(onehot, columns = encoder.columns_)
    # Print header.
    onehot.head()


    # # Compute the support metric
    #

    # Print support metric over all rows for each column.
    onehot.mean(axis=0)


    # # Compute the item count distribution over transactions


    # Print distribution of item counts.
    onehot.sum(axis=1).value_counts()


    # # Create a column for an itemset with multiple items
    #


    # Add a combined itemset column for sports_leisure AND health_beauty to the DataFrame.
    onehot['sports_leisure_health_beauty'] = onehot['sports_leisure'] & onehot['health_beauty']

    # Print support value.
    onehot['sports_leisure_health_beauty'].mean(axis = 0)


    # # Aggregate the dataset further by combining product sub-categories
    # We can use the inclusive OR operation to combine multiple categories.
    # * True | True = True
    # * True | False = True
    # * False | True = True
    # * False | False = False

    # Merge books_imported and books_technical.
    onehot['books'] = onehot['books_imported'] | onehot['books_technical']

    # Print support values for books, books_imported, and books_technical.
    onehot[['books','books_imported','books_technical']].mean(axis=0)


    # # Compute the confidence metric
    #

    # Compute joint support for sports_leisure and health_beauty.
    joint_support = (onehot['sports_leisure'] & onehot['health_beauty']).mean()

    # Print confidence metric for sports_leisure -> health_beauty.
    joint_support / onehot['sports_leisure'].mean()


    # Print confidence for health_beauty -> sports_leisure (note the different denominator).
    joint_support / onehot['health_beauty'].mean()
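    # (These hand-computed values correspond to the 'support' and 'confidence' columns that
    # association_rules produces further below.)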


    # # The Apriori Algorithm and Pruning

    from mlxtend.frequent_patterns import apriori

    # Apply apriori algorithm to data with min support threshold of 0.01.
    frequent_itemsets = apriori(onehot, min_support = 0.01)

    # Print frequent itemsets.
    frequent_itemsets


    # Apply apriori algorithm to data with min support threshold of 0.001.
    frequent_itemsets = apriori(onehot, min_support = 0.001, use_colnames = True)

    # Print frequent itemsets.
    frequent_itemsets


    # Apply apriori algorithm to data with min support threshold of 0.00005.
    frequent_itemsets = apriori(onehot, min_support = 0.00005, use_colnames = True)

    # Print frequent itemsets.
    frequent_itemsets

    # Apply apriori algorithm to data with a two-item limit.
    frequent_itemsets = apriori(onehot, min_support = 0.00005, max_len = 2, use_colnames = True)


    # # Computing association rules from Apriori output

    from mlxtend.frequent_patterns import association_rules

    # Recover association rules using support and a minimum threshold of 0.0001.
    rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.0001)

    # Print rules header.
    rules.head()
    rules.to_csv('result_datasets/result_apriori.csv')


    # # Pruning association rules

    # Recover association rules using confidence threshold of 0.01.
    rules = association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.01)

    # Print rules.
    rules
    rules.to_csv('result_datasets/result_Pruning.csv')


    # Select rules with a consequent support above 0.095.
    rules = rules[rules['consequent support'] > 0.095]

    # Print rules.
    rules


    # # The leverage metric
    #

    # Select rules with leverage higher than 0.0.
    rules = rules[rules['leverage'] > 0.0]

    # Print rules.
    rules


    # # Visualizing patterns in metrics

    # Recover association rules with a minimum support greater than 0.000001.
    rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.000001)
Example #10
def test_inverse_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))
Example #11
data = pd.read_csv("correct_data.csv")

df = pd.DataFrame(data)

df = (df.groupby(["Panel ID", "Date"]))

list_of_unique = list(df["Category"].unique())

flattened = [i for t in list_of_unique for i in t]
groceries = list(set(flattened))
rules = list(permutations(groceries, 2))
rules_df = pd.DataFrame(rules, columns=['antecedents', 'consequents'])

print(rules)

encoder = TransactionEncoder().fit(list_of_unique)

onehot = encoder.transform(list_of_unique)

onehot = pd.DataFrame(onehot, columns=encoder.columns_)

support = onehot.mean()

print(onehot.head())

print(support)


def support(x):
    # Compute support for antecedent AND consequent
    support = x.mean()
    return support
Example #12
def test_fit_transform():
    oht = TransactionEncoder()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
Example #13
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert (isinstance(trans, csr_matrix))
    np.testing.assert_array_equal(expect, trans.todense())
Example #14
def test_fit():
    oht = TransactionEncoder()
    oht.fit(dataset)
    assert (oht.columns_ == [
        'Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice'
    ])
Example #15
def encode(data):
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    return df
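
A hedged usage note: with invented baskets, encode([['milk', 'bread'], ['bread', 'butter']]) should return a boolean DataFrame with one row per basket and one column per unique item, ready to be passed to apriori.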
Example #16
unique_items = set(np.unique(np.concatenate(np_data)))


# %%
print(unique_items)


# %%
print(len(unique_items))

# %% [markdown]
# ## FPGrowth and FPMax

# %%
# 1
te = TransactionEncoder()
te_ary = te.fit(np_data).transform(np_data)
data = pd.DataFrame(te_ary, columns=te.columns_)


# %%
data


# %%
# 2
result_fpgrowth = fpgrowth(data, min_support=0.03, use_colnames = True)
result_fpgrowth['length'] = np.fromiter(map(len, result_fpgrowth['itemsets']),dtype=int)


# %%
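# %% [markdown]
# The header above also names FPMax; the cell below is a minimal sketch under the same
# assumptions (the encoded data DataFrame from cell 1). fpmax keeps only the maximal
# frequent itemsets.

# %%
# 3
from mlxtend.frequent_patterns import fpmax

result_fpmax = fpmax(data, min_support=0.03, use_colnames=True)
result_fpmax['length'] = np.fromiter(map(len, result_fpmax['itemsets']), dtype=int)
result_fpmax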
Example #17
print(num_customer.describe())
###############################################################################
#Q2-B
print("\n Answer to question 2 part B :\n")
print("Unique Items = ", len(set(Groceries_df['Item'])))
###############################################################################
#Q2-C
print("\n Answer to question 2 part C :\n")
plt.hist(x=num_customer)
plt.grid(True)
plt.show()

###############################################################################
ListItem = Groceries_df.groupby(['Customer'
                                 ])['Item'].apply(list).values.tolist()
tran_encoder = TransactionEncoder()
tran_encoder_list = tran_encoder.fit(ListItem).transform(ListItem)
ItemIndicator = pd.DataFrame(tran_encoder_list, columns=tran_encoder.columns_)

# Q2-D)
print("\n Answer to question 2 part D :\n")
frq_itemsets = apriori(ItemIndicator,
                       min_support=75 / len(num_customer),
                       use_colnames=True)
frq_itemsets['length'] = frq_itemsets['itemsets'].apply(lambda x: len(x))
# number of itemsets
print("The number of itemsets: ")
print(len(frq_itemsets))
# highest k value
print("The highest k value: ")
print(max(frq_itemsets['length']))
Example #18
#Bin Creation and Assignment to the data
CGmax, CG0, BOLmax = [], [], []
apriDF = []
for i in range(len(cgm)):
    CGmax.append(maximum(cgm.loc[i]))
    BOLmax.append(maximum(bol.loc[i]))
    CG0.append(cgm.loc[i][5])
    apriDF.append([
        CalculateBins(max(cgm.loc[i])),
        CalculateBins(cgm.loc[i][5]),
        max(bol.loc[i])
    ])

#Apriori Algorithm
#For Most Frequent Itemsets
transEnc = TransactionEncoder()
transactions = pd.DataFrame(transEnc.fit(apriDF).transform(apriDF),
                            columns=transEnc.columns_)
rules = ar(ap(transactions, min_support=0.00000000001, use_colnames=True),
           min_threshold=0.0)
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
for column in ['antecedents', 'consequents']:
    rules[column] = rules[column].astype(str)
    rules[column] = rules[column].str.replace(re.escape('frozenset({'), '')
    rules[column] = rules[column].str.replace(re.escape('})'), '')
rules["SET"] = rules["antecedents"] + ',' + rules['consequents']
rules['SET'] = rules['SET'].str.replace("'", "")
rules['SET'] = rules.SET.apply(lambda x: x.split(','))
#rules.to_csv("Rules.csv")
li = rules['SET'].tolist()
y = [[(float(j)) for j in i] for i in li]
Example #19
def makedataframe(dataset):
    transactencoder = TransactionEncoder()
    transactencoder_array = transactencoder.fit(dataset).transform(dataset)
    dataframe = pd.DataFrame(transactencoder_array,
                             columns=transactencoder.columns_)
    return dataframe
Example #20
    # print(temp)
    transactions.append(temp)
# print(transactions)
'''Using the efficient_apriori package'''
from efficient_apriori import apriori
itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.3)
print('Frequent itemsets:', itemsets)
print('Association rules:', rules)

print('-' * 100)
'''Using the mlxtend.frequent_patterns package'''
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
# One-hot encode the transactions
te = TransactionEncoder()  # standard usage: treat it as a utility
data = te.fit_transform(transactions)  # standard usage: treat it as a utility
transactions = pd.DataFrame(
    data, columns=te.columns_)  # use the original item values as column names; otherwise numeric indices are used
itemsets = apriori(transactions, use_colnames=True, min_support=0.05)
# Sort by support in descending order
itemsets = itemsets.sort_values(by="support", ascending=False)
print('-' * 20, 'Frequent itemsets', '-' * 20)
print(itemsets)
# Compute association rules from the frequent itemsets, with a minimum lift threshold
rules = association_rules(itemsets, metric='lift', min_threshold=1.1)
# Sort by lift in descending order
rules = rules.sort_values(by="lift", ascending=False)
print('-' * 20, 'Association rules', '-' * 20)
print(rules)
Example #21
def get_frequent_set(total_i_list, appointed_output, my_refrigerator):
    def set_giver(sort_set_list: list):
        recommended_fequent_set = []

        for r_num in range(1, 5):
            recommended_name = []
            for num, i in enumerate(sort_set_list):
                if r_num == 1:
                    # the checks are written inverted: sets that fail a check hit continue
                    if not ((appointed_output.issubset(i)) and
                            (i.issubset(my_refrigerator))):
                        continue
                elif r_num == 2:
                    if not ((appointed_output.issubset(i)) and
                            (round(sort_set.iloc[num, 0], 3) > 0.3)):
                        continue
                elif r_num == 3:
                    if not i.issubset(my_refrigerator):
                        continue
                else:
                    pass

                # Only sets that pass the checks above get appended
                '''
                i: the associated itemset
                i & my_refrigerator - appointed_output: ingredients in the itemset (from the refrigerator) beyond the input
                i - my_refrigerator: items in the itemset the user does not have
                appointed_output - i: input ingredients not covered by the itemset
                '''
                set_info = [
                    sort_set.index[num],
                    round(sort_set.iloc[num, 0],
                          3), i, i & my_refrigerator - appointed_output,
                    i - my_refrigerator, appointed_output - i, r_num
                ]
                recommended_fequent_set.append(set_info)
                recommended_name.append(i)

                # Return as soon as 5 sets have been collected
                if len(recommended_fequent_set) == 5:
                    return recommended_fequent_set

            for selected_set in recommended_name:
                sort_set_list.remove(selected_set)

        # Return however many results were found
        return recommended_fequent_set

    te = TransactionEncoder()
    te_ary = te.fit(total_i_list).transform(total_i_list)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    if frequent_itemsets["itemsets"].count() < 2:
        frequent_itemsets = apriori(df,
                                    min_support=1 / (df.count()[0] - 1),
                                    use_colnames=True)
    # print(f"{frequent_itemsets.count()}")
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    output_set = frequent_itemsets[(frequent_itemsets['length'] >=
                                    len(appointed_output) + 1)]
    # print(output_set.count())
    sort_set = output_set.sort_values(['support'], ascending=False)

    recommended_fequent_set = set_giver(sort_set['itemsets'].tolist())

    return recommended_fequent_set
Example #22
    def setUp(self):
        database = [['a'], ['b'], ['c', 'd'], ['e']]
        te = TransactionEncoder()
        te_ary = te.fit(database).transform(database)

        self.df = pd.DataFrame(te_ary, columns=te.columns_)
Example #23
 def run(self):
     te = TransactionEncoder()
     te_ary = te.fit(self.df).transform(self.df)
     df = pd.DataFrame(te_ary, columns=te.columns_)
     df = apriori(df, min_support=0.05)
     return df
Example #24
    def obterner_lista(self):
        try:
            db = pymysql.connect(host=self.host,
                                 user=self.user,
                                 password=self.password,
                                 database=self.database)
            # TODO: remove this line
            self.id_cliente = 273
            self.id_local = 42
            db_cursor = db.cursor()

            # Fetch the transactions/products
            query = """
                SELECT
                    GROUP_CONCAT(DISTINCT f.id_familia) productos
                FROM
                    pedido_articulo pa
                JOIN    pedido p USING (id_pedido)
                JOIN    articulo a USING (cod_interno)
                JOIN    familia f USING (id_familia)
                WHERE
                        p.id_cliente = %s
                AND     p.id_local = %s
                GROUP BY p.id_pedido
                LIMIT 2
                
                ;"""
            execute = db_cursor.execute(query,
                                        (self.id_cliente, self.id_local))
            print(" LA exceute ", execute, '\n')
            # print(db_cursor.fetchall(), '\n')
            # row = db_cursor.fetchone()
            # while row is not None:
            #     for r in row:
            #         print(r)
            #     row = db_cursor.fetchone()
            te = TransactionEncoder()
            dataset = []
            row = db_cursor.fetchone()
            print(row)
            e = ''
            while row is not None:
                data_aux = []
                for elem in row[0]:
                    if (elem != ','):
                        e = e + elem
                    else:
                        data_aux.insert(len(data_aux), int(e))
                        e = ''
                e = ''
                dataset.insert(len(dataset), data_aux)
                row = db_cursor.fetchone()
            print(dataset)
            te_ary = te.fit(dataset).transform(dataset)
            df = pd.DataFrame(te_ary, columns=te.columns_)
            pd.set_option("display.max_rows", None, "display.max_columns",
                          None)
            print(df)
            fpg = fpgrowth(df, min_support=0.6)
            print(fpg)
        except Exception as e:
            print(e)
            exit(-1)
        finally:
            db.close()
            db_cursor.close()
Example #25
#################################

# Import necessary python libraries
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Read in CSV file into an array of arrays
# Make sure that your data is structured like the data given in tutorial
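# A hedged illustration of the layout this script expects (item names invented):
# each CSV line is one transaction, with its items separated by commas, e.g.
#   Milk,Bread,Butter
#   Bread,Eggs
#   Milk,Eggs,Butter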
dataset = []
with open('apriori_data.csv') as f:
    reader = csv.reader(f)
    for row in reader:
        dataset.append(row)
for row in dataset:
    print(row)

# Transform your data for the apriori algorithm
oht = TransactionEncoder()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
print(frequent_itemsets)

rules = association_rules(frequent_itemsets,
                          metric="confidence",
                          min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])
Example #26
def return_support(list_sku=list_sku):
    te = TransactionEncoder()
    te_ary = te.fit(list_sku).transform(list_sku)
    df_list = pd.DataFrame(te_ary, columns=te.columns_)
    support = apriori(df_list, min_support=0.05, use_colnames=True)
    return support
Example #27
    def reduce_rules(self, rules, instance_quorum, number_of_medoids):
        """following_breadcrumbs function finds
        Args:
            instance: The instance we want to find the paths
            reduction: The targets of our train data
            save_plots: The bar and stacked area plots for every feature will be saved
        Return:

        """
        get_itemsets = []
        for pr in rules:
            itemset = []
            for p in pr:
                itemset.append(p)
            get_itemsets.append(itemset)
        te = TransactionEncoder()
        te_ary = te.fit(get_itemsets).transform(get_itemsets)
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_itemsets = association_rules(apriori(df,
                                                      min_support=0.1,
                                                      use_colnames=True),
                                              metric="support",
                                              min_threshold=0.1).sort_values(
                                                  by="confidence",
                                                  ascending=True)
        size = 0
        k = 1
        size_of_ar = len(list(list(frequent_itemsets['antecedents'])))
        items = set()
        reduced_rules = rules
        new_feature_list = []
        for pr in reduced_rules:
            for p in pr:
                items.add(p)
            new_feature_list = list(items)
        while size < instance_quorum and k < size_of_ar:
            feature_set = set()
            for i in range(0, k):
                for j in list(list(frequent_itemsets['antecedents'])[i]):
                    feature_set.add(j)
            new_feature_list = list(feature_set)
            redundant_features = [
                i for i in self.feature_names if i not in new_feature_list
            ]
            reduced_rules = []
            for i in rules:
                if sum([1 for j in redundant_features if j in i]) == 0:
                    reduced_rules.append(i)
            size = len(reduced_rules)
            k += 1
        del get_itemsets, te, te_ary, df, frequent_itemsets
        if len(reduced_rules) < instance_quorum:
            reduced_rules = rules
            for pr in reduced_rules:
                for p in pr:
                    items.add(p)
                new_feature_list = list(items)
        if len(reduced_rules
               ) > instance_quorum:  # If we need more reduction on path level
            A = []
            for k in range(len(reduced_rules)):
                B = []
                for j in range(len(reduced_rules)):
                    if k == j:
                        B.append(0)  # or 1?
                    else:
                        sim = path_similarity(reduced_rules[k],
                                              reduced_rules[j],
                                              new_feature_list,
                                              self.min_max_feature_values)
                        # sim = path_distance(reduced_rules[k], reduced_rules[j], new_feature_list,
                        # self.min_max_feature_values) #Tested with distance metric of iForest
                        B.append(1 - sim)
                A.append(B)
            A = np.array(A)
            MS, S = kmedoids.kMedoids(A, number_of_medoids)
            medoids_sorted = sorted(S, key=lambda k: len(S[k]), reverse=True)
            k = 0
            size = 0
            reduced_rules_medoids = []
            while size < instance_quorum and k < len(medoids_sorted):
                for j in S[medoids_sorted[k]]:
                    reduced_rules_medoids.append(reduced_rules[j])
                k += 1
                size = len(reduced_rules_medoids)
            items = set()
            if len(reduced_rules_medoids) >= instance_quorum:
                reduced_rules = reduced_rules_medoids
                for pr in reduced_rules_medoids:
                    for p in pr:
                        items.add(p)
                new_feature_list = list(items)
        if len(reduced_rules) > instance_quorum:
            random.shuffle(reduced_rules)
            reduced_rules = reduced_rules[:instance_quorum]
            items = set()
            for pr in reduced_rules:
                for p in pr:
                    items.add(p)
            new_feature_list = list(items)
        return [reduced_rules, new_feature_list]
Example #28
def main():
    train = pd.read_csv('data/train.csv')
    # test = pd.read_csv('data/test.csv')
    entity_weight = pd.read_csv('data/entity_weight.csv')

    train = train.drop(['keyword', 'location'], axis=1)

    # train_hashtags = train.copy()
    train['hashtags'] = train['text'].apply(lambda x: extract_hashtags(x))
    train_hashtags = train[train['hashtags'].map(lambda d: len(d)) > 0].copy()
    train_hashtags['target'] = train_hashtags['target'].astype(str)
    train_hashtags['t'] = train_hashtags['hashtags'] + train_hashtags['target'].apply(lambda x: [x])

    # hashtags = []
    # for x in train['hashtags']:
    #     hashtags.extend(x)
    #
    # hashtags = list(set(hashtags))

    cd = CleanData()
    data_clean = cd.normalize_text(train.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()
    data_clean['target'] = data_clean['target'].astype('str')

    # keywords of all rows
    keys = []
    data_clean['keywords'].apply(lambda x: keys.extend(x))
    keys = list(set(keys))
    # keywords = extract_ents(keys)
    # (pd.DataFrame.from_dict(keywords, orient='index')).to_csv('data/keywords.csv')
    keywords = pd.read_csv('data/keywords.csv')

    keywords = keywords.merge(entity_weight, how='left', left_on='entity', right_on='entity')
    keywords_dic = dict(zip(keywords['keyword'], keywords['weight']))

    # messages_vector = text_to_vector_weighted_entity(data_clean, keywords_dic)
    # messages_vector.set_index('id').to_csv('data/messages_vector.csv', header=True)
    data_clean['t'] = data_clean['keywords'] + data_clean['target'].apply(lambda x: [x])

    te = TransactionEncoder()
    te_ary = te.fit(data_clean['t']).transform(data_clean['t'])
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.005, use_colnames=True)

    rules = association_rules(frequent_itemsets)
    # rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

    rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
    rules = rules[((rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})) & (rules['confidence'] >= 0.55)]
    tmp = rules[(rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})]
    rule_keywords = list(rules['antecedents'])
    rule_keywords = frozenset().union(*rule_keywords)

    with open('data/rule_keywords.txt', 'w') as f:
        for item in rule_keywords:
            f.write("%s\n" % item)

    rule_entity_keywords = {key: value for key, value in keywords_dic.items() if key in rule_keywords}

    # one_rules = rules[((rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})) & (rules['confidence'] >= 0.55)]
    # one_rules = one_rules[(one_rules['confidence'] >= 0.55)]
    # one_rules = one_rules[(one_rules['antecedent_len'] >= 2) &
    #                   (one_rules['confidence'] > 0.75) &
    #                   (one_rules['lift'] > 1.2)]

    # zero_rules = rules[(rules['consequents'] == {'0'})]
    # zero_rules = zero_rules[(zero_rules['confidence'] >= 0.55)]
    #
    # # hashtag rules
    # te_hashtags = TransactionEncoder()
    # te_ary_hashtags = te_hashtags.fit(train_hashtags['t']).transform(train_hashtags['t'])
    # df_hashtags = pd.DataFrame(te_ary_hashtags, columns=te_hashtags.columns_)
    # frequent_itemsets_hashtags = apriori(df_hashtags, min_support=0.005, use_colnames=True)
    #
    # rules_hashtags = association_rules(frequent_itemsets_hashtags, metric="confidence", min_threshold=0.6)
    # # rules_hashtags = association_rules(frequent_itemsets_hashtags, metric="lift", min_threshold=1.2)
    #
    # rules_hashtags["antecedent_len"] = rules_hashtags["antecedents"].apply(lambda x: len(x))
    #
    # one_rules_hashtags = rules_hashtags[(rules_hashtags['consequents'] == {'1'})]
    # one_rules_hashtags = one_rules_hashtags[(one_rules_hashtags['confidence'] >= 0.55)]
    #
    # zero_rules_hashtags = rules_hashtags[(rules_hashtags['consequents'] == {'0'})]
    # zero_rules_hashtags = zero_rules_hashtags[(zero_rules_hashtags['confidence'] >= 0.55)]
    #
    # # frozensets of keywords and hashtags
    # ones_keywords = list(one_rules['antecedents'])
    # ones_keywords = frozenset().union(*ones_keywords)
    # ones_hashtags = list(one_rules_hashtags['antecedents'])
    # ones_hashtags = frozenset().union(*ones_hashtags)
    #
    # zeros_keywords = list(zero_rules['antecedents'])
    # zeros_keywords = frozenset().union(*zeros_keywords)
    # zeros_hashtags = list(zero_rules_hashtags['antecedents'])
    # zeros_hashtags = frozenset().union(*zeros_hashtags)

    # vector of messages
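    # NOTE: ones_keywords, ones_hashtags, zeros_keywords and zeros_hashtags are built in the
    # commented-out rule blocks above; those blocks need to be re-enabled before this call works.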
    messages_vector = text_to_vector_ar(data_clean, ones_keywords, ones_hashtags, zeros_keywords, zeros_hashtags)
    X_train_df, X_test_df, y_train, y_test = train_test_split(data_clean['text'], data_clean['target'], random_state=0)

    X_train = messages_vector.iloc[X_train_df.index].values
    X_test = messages_vector.iloc[X_test_df.index].values

    # train model
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print('LogisticRegression scores:\n')
    print(classification_report(y_test, y_predict))

    clf_svm = svm.SVC()
    clf_svm.fit(X_train, y_train)
    y_predict = clf_svm.predict(X_test)
    print('SVM Results:\n')
    print(classification_report(y_test, y_predict))

    print('done')
Example #29
data = pd.read_csv(r"C:\Users\acer\Downloads\BreadBasket_DMS.csv")

data = data.set_index(['Item'])
filtered = data.drop(['NONE'])
data = data.reset_index()
filtered = filtered.reset_index()
transaction_list = []

# For loop to create a list of the unique transactions throughout the dataset:
for i in filtered['Transaction'].unique():
    tlist = list(set(filtered[filtered['Transaction'] == i]['Item']))
    if len(tlist) > 0:
        transaction_list.append(tlist)

te = TransactionEncoder()
te_ary = te.fit(transaction_list).transform(transaction_list)
df2 = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df2, min_support=0.01, use_colnames=True)
# use the default minimum threshold
rules = association_rules(frequent_itemsets, metric='lift')

rules = rules.sort_values('confidence', ascending=False)
print("Rules:\n")
print(rules.head(5))
print("\n")
#now categorise every rule with different range of confidence
rules['support'] = rules['support'] * 100
rules['confidence'] = rules['confidence'] * 100
rules['lift'] = rules['lift']
Example #30
    def _freqItemsetMining(self):
        y = None
        print("Starting FIM")

        print("Transaction Encoder is started")
        te = TransactionEncoder()
        te_ary = te.fit(self._dataSet.to_numpy()).transform(
            self._dataSet.to_numpy())
        transformed_df = pd.DataFrame(te_ary, columns=te.columns_)
        print("Transaction Encoder is finished")
        print("Apriori is started")

        freqItemsets = apriori(transformed_df,
                               min_support=float(
                                   self._env.get(key="minSupport")),
                               use_colnames=True)
        print("Saving Frequent Itemsets...")
        freqItemsets.to_csv(self._env.get(key="freqItemFilePath"))
        print("Frequent Itemsets Saved")

        print("Apriori is finished")
        print("Association rules mining is started")
        rules = association_rules(freqItemsets,
                                  metric=self._env.get(key="ruleMetric"),
                                  min_threshold=float(
                                      self._env.get(key="minSupport")))
        rules = rules[
            (rules['consequents'] == {self._env.get(key="c1")}) |
            (rules['consequents'] == {self._env.get(key="c2")}) |
            (rules['consequents'] == {self._env.get(key="c3")}) |
            (rules['consequents'] == {self._env.get(key="c4")}) |
            (rules['consequents'] == {self._env.get(key="c5")}) &
            ((self._env.get(key="c1") not in (rules.antecedents.to_list())) |
             (self._env.get(key="c2") not in (rules.antecedents.to_list())) |
             (self._env.get(key="c3") not in (rules.antecedents.to_list())) |
             (self._env.get(key="c4") not in (rules.antecedents.to_list())) |
             (self._env.get(key="c5") not in (rules.antecedents.to_list())))]
        rules['antecedents'] = rules.apply(
            lambda row: FIMFunctions.convertToStringList(
                str(list(row['antecedents']))),
            axis=1)
        rules['consequents'] = rules.apply(
            lambda row: FIMFunctions.convertToStringList(
                str(list(row['consequents']))),
            axis=1)
        rules.reset_index(inplace=True)

        print("Association rules mining is finished")

        print("One Hot encoding is started")
        cols = self._dataSet.columns[:-1]
        print(cols)
        featureList = [
            self._env.get(key="p-tcp"),
            self._env.get(key="p-http"),
            self._env.get(key="p-ssh"),
            self._env.get(key="p-dns"),
            self._env.get(key="p-ftp"),
            self._env.get(key="p-sshv2"),
            self._env.get(key="l0"),
            self._env.get(key="l1"),
            self._env.get(key="l2"),
            self._env.get(key="l3"),
            # self._env.get(key="r-public"), self._env.get(key="r-private"), self._env.get(key="r-non"),
            self._env.get(key="c1"),
            self._env.get(key="c2"),
            self._env.get(key="c3"),
            self._env.get(key="c4"),
            self._env.get(key="c5"),
            self._env.get(key="d1"),
            self._env.get(key="d2"),
            self._env.get(key="d3"),
            self._env.get(key="d4"),
            self._env.get(key="d5"),
            self._env.get(key="d6"),
            self._env.get(key="d7"),
            self._env.get(key="d8"),
            self._env.get(key="d9"),
            self._env.get(key="d10")
        ]

        x = pd.DataFrame(FIMFunctions.oneHot(rules, featureList),
                         columns=featureList)
        if len(x) > 0:
            y = rules.apply(lambda row: str(row["consequents"][0].replace(
                '[', '').replace(']', '')),
                            axis=1)

        return x, y, rules