Example #1
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder


def onehot_encoding_transaction_data(transactions):
    # One-hot encode a list of transactions into a boolean DataFrame.
    encoder = TransactionEncoder().fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)

    print(onehot)
    return onehot
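
A usage sketch (the toy transactions below are illustrative, not from the source):

# Hypothetical call with a small list-of-lists of transactions.
transactions = [["bread", "milk"], ["bread", "beer"], ["milk", "beer", "diapers"]]
onehot = onehot_encoding_transaction_data(transactions)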
Example #2
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder


def onehot_online_retail_data(transactions):
    # One-hot encode the transactions, then keep a 10% random sample of rows.
    encoder = TransactionEncoder().fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)

    onehot = onehot.sample(frac=0.1)
    print("onehot.shape", onehot.shape)

    return onehot
Example #3
dataset=[["Bread","Milk","Beer"],
        ["Bread","Diapers","Egg"],
        ["Milk","Diapers","Beer","Cola"],
        ["Bread","Milk","Diapers","Beer"],
        ["Bread","Milk","Cola"]]

import pandas as pd  
from mlxtend.preprocessing import TransactionEncoder   
from mlxtend.frequent_patterns import apriori    

te=TransactionEncoder() 
te=te.fit(dataset)  
te_ary=te.transform(dataset)

df=pd.DataFrame(te_ary,columns=te.columns_)

frequent_itemsets=apriori(df,min_support=0.6,use_colnames=True)

#to apply association rules
from mlxtend.frequent_patterns import association_rules

rules=association_rules(frequent_itemsets,metric='support',min_threshold=0.6)

cartItem = []
init = input("1. Beer\t2. Bread\t3. Cola\t4. Diapers\t5. Egg\t6. Milk\nWhat do you wish to buy? ")
init = init.capitalize()
cartItem = cartItem + [init]
permission = 'y'

checklist = ['Beer', 'Bread', 'Cola', 'Diapers', 'Egg', 'Milk']
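
The snippet ends here; a hypothetical continuation (not from the source) showing how the mined rules could drive cart suggestions:

# Hypothetical continuation: suggest consequents of rules whose antecedents
# are already in the cart, then let the user keep adding items.
while permission == 'y':
    for _, row in rules.iterrows():
        if set(row['antecedents']).issubset(cartItem):
            print("Customers who bought", set(row['antecedents']),
                  "also bought", set(row['consequents']))
    permission = input("Add another item? (y/n) ").lower()
    if permission == 'y':
        nxt = input("Item: ").capitalize()
        if nxt in checklist:
            cartItem.append(nxt)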
lng = pd.DataFrame(df.explode("speaks").speaks)
lng.reset_index(inplace=True)
lng.drop(lng[lng.speaks.str.contains("poorly")].index, inplace=True)
lng.speaks = lng.speaks.str.split().str[0]

missing = sorted(set(range(df.shape[0]))-set(lng["index"]))
for i in missing:
    lng = pd.concat([lng, pd.DataFrame({"index":[i], "speaks":["english"]})])

lng = lng.groupby("index").agg({"speaks": lambda x: x.tolist()})
df.speaks = lng.speaks

# onehot dataframe of languages
encoder = TransactionEncoder().fit(df.speaks)
onehot = encoder.transform(df.speaks)
onehot = pd.DataFrame(onehot, columns = encoder.columns_)
onehot.mean().sample(10)

# 10 most popular languages
top10 = onehot.mean().sort_values(ascending=False).iloc[:10]
plt.plot(top10, marker='o', color="#5e3a98")
plt.xticks(rotation=45)
plt.ylabel("% of users")
plt.title("top 10 most popular languages")
plt.show()

# 10 most popular languages besides English and other
top12 = onehot.mean().sort_values(ascending=False).iloc[1:12]
top12.drop("other", inplace=True)
plt.plot(top12, marker='o', color="#5e3a98")
Example #5
import numpy as np
from scipy.sparse import csr_matrix
from mlxtend.preprocessing import TransactionEncoder


def test_transform():
    # `dataset` and `expect` are module-level fixtures (see the sketch below).
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset)
    np.testing.assert_array_equal(expect, trans)


def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert isinstance(trans, csr_matrix)
    np.testing.assert_array_equal(expect, trans.todense())
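
A minimal sketch of the `dataset` and `expect` fixtures these tests assume (illustrative values, not the library's actual test data):

# Hypothetical fixtures: three transactions over four items.
dataset = [['apple', 'beer'],
           ['beer', 'rice'],
           ['apple', 'rice', 'chicken']]

# Expected one-hot matrix; columns are the sorted item names
# ['apple', 'beer', 'chicken', 'rice'].
expect = np.array([[True,  True,  False, False],
                   [False, True,  False, True],
                   [True,  False, True,  True]])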
Example #9
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Builds a list of transactions from the `dataset` DataFrame;
# qtdlinhas (row count) and qtdcols (column count) are assumed to be defined earlier.
transacoes = []
for i in range(0, qtdlinhas):
    linhaTransacao = []
    for j in range(0, qtdcols):
        linhaTransacao.append(str(dataset.values[i, j]))

    transacoes.append(linhaTransacao)
print(transacoes)

te = TransactionEncoder()

# Loads the transactions into memory and determines the columns that will be generated during processing
te.fit(transacoes)

# The TransactionEncoder object converts the transactions into a binary matrix where each row represents one transaction
matriz_transacoes = te.transform(transacoes)

# Creates an auxiliary DataFrame from the binary matrix (the te.transform(transacoes) step) and the columns obtained earlier (the te.fit(transacoes) step)
dfAuxiliar = pd.DataFrame(matriz_transacoes, columns=te.columns_)

# Gets the most frequent itemsets with a minimum support of 0.005. The use_colnames parameter
# means the column names of the dfAuxiliar DataFrame are used to build the association rules.
itemsets_freq = apriori(dfAuxiliar, min_support=0.005, use_colnames=True)

# Some metrics:
# - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
# - confidence(A->C) = support(A+C) / support(A), range: [0, 1]
# - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
# - leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]
# - conviction(A->C) = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]
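
For concreteness, a minimal sketch (assuming the dfAuxiliar frame above; 'A' and 'C' stand in for two real column names) computing these metrics by hand for a single rule A -> C:

# Illustrative only: 'A' and 'C' are placeholders for two real item columns.
sup_A = dfAuxiliar['A'].mean()
sup_C = dfAuxiliar['C'].mean()
sup_AC = (dfAuxiliar['A'] & dfAuxiliar['C']).mean()  # support(A->C)

confidence = sup_AC / sup_A
lift = confidence / sup_C
leverage = sup_AC - sup_A * sup_C
conviction = (1 - sup_C) / (1 - confidence)
print(confidence, lift, leverage, conviction)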
Example #10
def recommend():

    # Import modules.
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from mlxtend.preprocessing import TransactionEncoder


    # Load orders dataset.
    orders = pd.read_csv(r'./input_original_datasets/olist_order_items_dataset.csv')

    # Load products dataset.
    products = pd.read_csv(r'./input_original_datasets/olist_products_dataset.csv')

    # Load translations dataset.
    translations = pd.read_csv(r'./input_original_datasets/product_category_name_translation.csv')

    # Print orders header.
    orders.head()

    # Print orders info.
    orders.info()


    # Print products header.
    products.head()


    # Print products info.
    products.info()


    # Print translations header.
    translations.head()


    # Print translations info.
    translations.info()


    # Translate product names to English.
    products = products.merge(translations, on='product_category_name', how="left")

    # Print English names.
    products['product_category_name_english']


    # # Convert product IDs to product category names.


    # Define product category name in orders DataFrame.
    orders = orders.merge(products[['product_id','product_category_name_english']], on='product_id', how='left')

    # Print orders header.
    orders.head()


    # Drop products without a defined category.
    orders.dropna(inplace=True, subset=['product_category_name_english'])
    # Print number of unique items.
    len(orders['product_id'].unique())

    # Print number of unique categories.
    len(orders['product_category_name_english'].unique())


    # # Construct transactions from order and product data

    # Identify transactions associated with example order.
    example1 = orders[orders['order_id'] == 'fe64170e936bc5f6a6a41def260984b9']['product_category_name_english']

    # Print example.
    example1


    # Identify transactions associated with example order.
    example2 = orders[orders['order_id'] == 'fffb9224b6fc7c43ebb0904318b10b5f']['product_category_name_english']

    # Print example.
    example2


    # # Map orders to transactions.

    # Recover transaction itemsets from orders DataFrame.
    transactions = orders.groupby("order_id").product_category_name_english.unique()

    # Print transactions header.
    transactions.head()


    # Plot 50 largest categories of transactions.
    transactions.value_counts()[:50].plot(kind='bar', figsize=(15,5))


    # Convert the pandas series to list of lists.
    transactions = transactions.tolist()

    # Print length of transactions.
    len(transactions)


    # Count number of unique item categories for each transaction.
    counts = [len(transaction) for transaction in transactions]
    # Print median number of items in a transaction.
    np.median(counts)


    # Print maximum number of items in a transaction.
    np.max(counts)


    # # Association Rules and Metrics


    from mlxtend.preprocessing import TransactionEncoder

    # Instantiate an encoder.
    encoder = TransactionEncoder()

    # Fit encoder to list of lists.
    encoder.fit(transactions)

    # Transform lists into one-hot encoded array.
    onehot = encoder.transform(transactions)

    # Convert array to pandas DataFrame.
    onehot = pd.DataFrame(onehot, columns = encoder.columns_)
    # Print header.
    onehot.head()


    # # Compute the support metric

    # Print support metric over all rows for each column.
    onehot.mean(axis=0)


    # # Compute the item count distribution over transactions


    # Print distribution of item counts.
    onehot.sum(axis=1).value_counts()


    # # Create a column for an itemset with multiple items


    # Add sports_leisure and health_beauty to DataFrame.
    onehot['sports_leisure_health_beauty'] = onehot['sports_leisure'] & onehot['health_beauty']

    # Print support value.
    onehot['sports_leisure_health_beauty'].mean(axis = 0)


    # # Aggregate the dataset further by combining product sub-categories
    # We can use the inclusive OR operation to combine multiple categories.
    # * True | True = True
    # * True | False = True
    # * False | True = True
    # * False | False = False

    # Merge books_imported and books_technical.
    onehot['books'] = onehot['books_imported'] | onehot['books_technical']

    # Print support values for books, books_imported, and books_technical.
    onehot[['books','books_imported','books_technical']].mean(axis=0)


    # # Compute the confidence metric

    # Compute joint support for sports_leisure and health_beauty.
    joint_support = (onehot['sports_leisure'] & onehot['health_beauty']).mean()

    # Print confidence metric for sports_leisure -> health_beauty.
    joint_support / onehot['sports_leisure'].mean()


    # Print confidence for health_beauty -> sports_leisure.
    joint_support / onehot['health_beauty'].mean()


    # # The Apriori Algorithm and Pruning

    from mlxtend.frequent_patterns import apriori

    # Apply apriori algorithm to data with min support threshold of 0.01.
    frequent_itemsets = apriori(onehot, min_support = 0.01)

    # Print frequent itemsets.
    frequent_itemsets


    # Apply apriori algorithm to data with min support threshold of 0.001.
    frequent_itemsets = apriori(onehot, min_support = 0.001, use_colnames = True)

    # Print frequent itemsets.
    frequent_itemsets


    # Apply apriori algorithm to data with min support threshold of 0.00005.
    frequent_itemsets = apriori(onehot, min_support = 0.00005, use_colnames = True)

    # Print frequent itemsets.
    frequent_itemsets

    # Apply apriori algorithm to data with a two-item limit.
    frequent_itemsets = apriori(onehot, min_support = 0.00005, max_len = 2, use_colnames = True)


    # # Computing association rules from Apriori output

    from mlxtend.frequent_patterns import association_rules

    # Recover association rules using support and a minimum threshold of 0.0001.
    rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.0001)

    # Print rules header.
    rules.head()
    rules.to_csv('result_datasets/result_apriori.csv')


    # # Pruning association rules

    # Recover association rules using confidence threshold of 0.01.
    rules = association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.01)

    # Print rules.
    rules
    rules.to_csv('result_datasets/result_Pruning.csv')


    # Select rules with a consequent support above 0.095.
    rules = rules[rules['consequent support'] > 0.095]

    # Print rules.
    rules


    # # The leverage metric

    # Select rules with leverage higher than 0.0.
    rules = rules[rules['leverage'] > 0.0]

    # Print rules.
    rules


    # # Visualizing patterns in metrics

    # Recover association rules with a minimum support greater than 0.000001.
    rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.000001)
Example #11
import pandas as pd
from itertools import permutations
from mlxtend.preprocessing import TransactionEncoder

# `data` is assumed to hold purchase records with "Panel ID", "Date",
# and "Category" fields.
df = pd.DataFrame(data)

# Each (panelist, date) group is one shopping trip.
grouped = df.groupby(["Panel ID", "Date"])

# One list of unique categories per trip.
list_of_unique = list(grouped["Category"].unique())

# Flatten to the set of all categories, then enumerate candidate rules.
flattened = [i for t in list_of_unique for i in t]
groceries = list(set(flattened))
rules = list(permutations(groceries, 2))
rules_df = pd.DataFrame(rules, columns=['antecedents', 'consequents'])

print(rules)

encoder = TransactionEncoder().fit(list_of_unique)

onehot = encoder.transform(list_of_unique)

onehot = pd.DataFrame(onehot, columns=encoder.columns_)

support_values = onehot.mean()

print(onehot.head())

print(support_values)


def support(x):
    # Support: the share of transactions containing the itemset.
    return x.mean()
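
A usage sketch under the same assumptions (the helper and column names come from the snippet above; the loop itself is illustrative):

# Illustrative usage: joint support for each candidate antecedent -> consequent pair.
rules_df['support'] = [
    support(onehot[ant] & onehot[con])
    for ant, con in zip(rules_df['antecedents'], rules_df['consequents'])
]
print(rules_df.sort_values('support', ascending=False).head())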
Example #12
#
# Example from the main text
#

import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Create the data
tr_data = [['milk', 'bread'], ['butter'], ['beer', 'diapers'],
           ['milk', 'bread', 'butter'], ['bread']]

# Preprocess the data
tr_encoder = TransactionEncoder()
tr_encoder.fit(tr_data)
tr_encoder.columns_
# ['beer', 'bread', 'butter', 'diapers', 'milk']

tr_encoder_ary = tr_encoder.transform(tr_data)
tr_encoder_ary = np.where(tr_encoder_ary == True, 1, 0)
df = pd.DataFrame(tr_encoder_ary, columns=tr_encoder.columns_)
print(df)
#    beer  bread  butter  diapers  milk
# 0     0      1       0        0     1
# 1     0      0       1        0     0
# 2     1      0       0        1     0
# 3     0      1       1        0     1
# 4     0      1       0        0     0

# Fit the association model
freq_items = apriori(df, min_support=0.4, use_colnames=True)

freq_items['length'] = freq_items['itemsets'].apply(lambda x: len(x))
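
A small follow-up sketch (not from the source): the length column makes it easy to filter, e.g. keeping only two-item frequent itemsets.

# Keep only frequent itemsets with exactly two items.
pairs = freq_items[freq_items['length'] == 2]
print(pairs)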
Example #13
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# `data` is assumed to be one comma-separated line of a transactions file;
# each value becomes one item of a single transaction in `dataset`.
dataset = []
dataset.append([values.strip('\n') for values in data.split(',')])


# Alternative hard-coded dataset:
'''
dataset=[["Bread", "Milk", "Beer"],
 ["Bread", "Diapers", "Eggs"],
 ["Milk", "Diapers", "Beer", "Cola"],
 ["Bread", "Milk", "Diapers", "Beer"],
 ["Bread", "Milk", "Cola"]]
'''

#########################################################
te = TransactionEncoder()
te = te.fit(dataset)            # Learns the unique items
te_ary = te.transform(dataset)  # Converts the transactions into a boolean matrix
df = pd.DataFrame(te_ary, columns=te.columns_)

#########################################################
frequent_items = apriori(df, min_support=0.6, use_colnames=True)
frequent_items
rules = association_rules(frequent_items,
                          metric='confidence',
                          min_threshold=0.5)
#########################################################
# Shopping part (assumes global lists `available` and `cart` defined earlier)
def item():
  print("Items available are:", available)
  print("Your cart contains", cart)
  buy = input("What do you want to buy? \nItem: ")
  buy = buy.title()