Example #1
import pandas as pd
from fim import eclat

def run_eclat(dataset):
    # convert the data into a NumPy array
    X = dataset.iloc[:, :].values

    # build one subject list per student
    subjects = []
    current_subject = []
    current_student = X[0, 0]
    for i in range(len(X)):
        if current_student == X[i, 0]:
            current_subject.append(X[i, 1])
        else:
            if len(current_subject) > 1:
                subjects.append(current_subject)
            current_subject = [X[i, 1]]  # start the new student's list with this row
            current_student = X[i, 0]
    if len(current_subject) > 1:  # keep the last student's list too
        subjects.append(current_subject)

    results = []
    results = eclat(subjects, out=results, supp=0.09, zmin=2)

    found = []
    for i in range(len(results)):
        found.append(list(results[i][0]))

    return pd.DataFrame(found)
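PyFIM interprets a positive supp as a percentage of the transactions (so supp=0.09 above asks for 0.09% support) and a negative supp as an absolute transaction count. A minimal sketch of both conventions; the toy baskets are made up:

from fim import eclat

baskets = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b', 'c']]
print(eclat(baskets, supp=50, zmin=2))  # itemsets in at least 50% of the baskets
print(eclat(baskets, supp=-2, zmin=2))  # itemsets in at least 2 baskets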
Example #2
def runForAll(OUTPUT_DIR, XML_OUTPUT_DIR, FV_CSV, FV_PICKLE):
	createDirIfNotExists(OUTPUT_DIR)
	createDirIfNotExists(XML_OUTPUT_DIR)

	transactions, idMap = loadData(FV_CSV)

	print("Mining itemsets")
	# negative supp = absolute support count in PyFIM; target='c' mines closed itemsets
	itemsets = eclat(tracts=transactions, target='c', supp=-MIN_SUPPORT, zmin=MIN_ITEMSET_SIZE)
	print("Done mining itemsets")
	if itemsets is None:
		print("No itemsets found")
		sys.exit(0)
	print("Found", len(itemsets), "itemsets")
	labels, fvValues, fvidmap, labidmap, featureVector = pickle.load(open(FV_PICKLE, 'rb'))
	fvinv_map = {v: k for k, v in fvidmap.items()}
	labinv_map = {v: k for k, v in labidmap.items()}

	decoded_transactions = [set(expandCpreds(t, idMap, fvinv_map)) for t in transactions]

	args = [(t, itemsets, MIN_CONF, transactions, decoded_transactions, decodeLabel(idt, labels, labinv_map), idt, fvinv_map, idMap, OUTPUT_DIR, XML_OUTPUT_DIR, labels, labinv_map) for idt, t in enumerate(transactions) if len(t) > 0]

	if PARALLELIZE:
		# fan the consistency checks out across all cores
		p = mp.Pool(mp.cpu_count())
		p.map(checkConsistency, args)
	else:
		for arg in args:
			checkConsistency(arg)
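Since checkConsistency takes one tuple argument, Pool.map applies it directly; a function with separate parameters would need Pool.starmap instead. A minimal sketch of the same fan-out pattern (the worker and task list are illustrative):

import multiprocessing as mp

def worker(task):
    idx, payload = task  # the single tuple argument is unpacked inside the worker
    return idx, len(payload)

if __name__ == '__main__':
    tasks = [(i, 'x' * i) for i in range(8)]
    with mp.Pool(mp.cpu_count()) as pool:
        print(pool.map(worker, tasks))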
Example #3
def eclatRules(das, country):
    if country != 'all':
        das = das[das.ship_to_country.apply(lambda x: x in country)]
    das = das.groupby(['order_line'])['ordered_item'].apply(list).values.tolist()
    # PyFIM reports each rule as (consequent, antecedent, *report values)
    eclat_result = eclat(das, supp=10, zmax=4, report='aCL', target='r', eval='l')
    eclat_result = pd.DataFrame(eclat_result)
    eclat_result.columns = ['consequents', 'antecedents', 'support', 'confidence', 'lift']
    eclat_result.antecedents = eclat_result.antecedents.astype('str')
    eclat_result = eclat_result[eclat_result.antecedents != '()']  # drop rules with an empty antecedent
    return eclat_result
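With target='r', PyFIM reports each rule as (consequent, antecedent, *report values), head first, which is why the first column above holds the consequents. A toy sketch under that assumption:

from fim import eclat

baskets = [['a', 'b'], ['a', 'b'], ['a', 'c'], ['b', 'c']]
for head, body, supp, conf in eclat(baskets, target='r', supp=25, report='aC'):
    print(body, '->', head, '| support:', supp, '| confidence:', conf, '%')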
Example #4
def do_eclat(min_support):
    transactions = []
    result = return_data_rows()
    for row in result:
        one_trip_locations = []
        for location in row.polyline:
            # bucket nearby GPS points by truncating the printed coordinates
            x = str(location.lat)[:8] + "," + str(location.long)[:8]
            one_trip_locations.append(x)
        transactions.append(tuple(one_trip_locations))
    start_time = datetime.now()
    itemsets = eclat(transactions, supp=min_support * 10)
    itemsets.sort(key=lambda x: x[1], reverse=True)  # highest support first
    end_time = datetime.now()
    print(itemsets)
    diff = end_time - start_time
    print("eclat took", diff.total_seconds(), "seconds")
Example #5
import logging
from typing import List

from fim import eclat


def transactions2freqitems(transactions_by_labels: List[List],
                           supp=0.05,
                           zmin=1,
                           zmax=3) -> List[tuple]:

    supp = int(supp * 100)  # PyFIM expects a percentage, not a fraction
    itemsets = set()
    for trans in transactions_by_labels:
        itemset = [
            tuple(sorted(r[0]))
            for r in eclat(trans, supp=supp, zmin=zmin, zmax=zmax)
        ]
        itemsets |= set(itemset)

    itemsets = list(itemsets)

    logging.info("Total {:d} itemsets mined".format(len(itemsets)))
    return itemsets
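A usage sketch for the function above; the label groups are made up:

trans_by_label = [
    [['a', 'b'], ['a', 'b', 'c'], ['b', 'c']],  # transactions for label 0
    [['a', 'c'], ['a', 'b', 'c']],              # transactions for label 1
]
print(transactions2freqitems(trans_by_label, supp=0.5, zmin=2))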
Example #6
def runForEPointGroups():
	createDirIfNotExists(EPG_OUTPUT_DIR)
	createDirIfNotExists(EPG_XML_OUTPUT_DIR)

	# Read in input directory
	csvFiles = [getFilename(f) for f in os.listdir(EPG_FV_DIR) if isFile(EPG_FV_DIR, f) and f.endswith('.csv')]
	for fname in csvFiles:
		EPG_FV_CSV = os.path.join(EPG_FV_DIR, fname + '.csv')
		EPG_FV_PICKLE = os.path.join(EPG_FV_DIR, fname + '.pickle')
		EPG_i_OD = os.path.join(EPG_OUTPUT_DIR, fname)
		EPG_i_OD_XML = os.path.join(EPG_XML_OUTPUT_DIR, fname)

		createDirIfNotExists(EPG_i_OD)
		createDirIfNotExists(EPG_i_OD_XML)

		transactions, idMap = loadData(EPG_FV_CSV)

		# Make sure the transactions are not all empty
		if len(idMap) <= 0:
			continue

		print("Mining itemsets for", fname)
		itemsets = eclat(tracts=transactions, target='c', supp=-MIN_SUPPORT, zmin=MIN_ITEMSET_SIZE)
		print("Done mining itemsets for", fname)
		if itemsets is None:
			print("No itemsets found")
			sys.exit(0)
		print("Found", len(itemsets), "itemsets")
		labels, fvValues, fvidmap, labidmap, featureVector = pickle.load(open(EPG_FV_PICKLE, 'rb'))
		fvinv_map = {v: k for k, v in fvidmap.items()}
		labinv_map = {v: k for k, v in labidmap.items()}

		decoded_transactions = [set(expandCpreds(t, idMap, fvinv_map)) for t in transactions]

		# order the (itemset, support) pairs by descending support
		itemsets = sorted([(isup, iset) for iset, isup in itemsets], reverse=True)
		itemsets = [(iset, isup) for isup, iset in itemsets]

		for idt, t in enumerate(transactions):
			checkGroupConsistency((t, itemsets, MIN_CONF, transactions, decoded_transactions, decodeLabel(idt, labels, labinv_map), idt, fvinv_map, idMap, EPG_i_OD, EPG_i_OD_XML, labels, labinv_map))
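The two-line swap near the end of the loop orders the (itemset, support) pairs by descending support; a sort key does the same in one pass:

itemsets = sorted(itemsets, key=lambda pair: pair[1], reverse=True)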
Example #7
from sys import argv

from fim import apriori, eclat, fpgrowth, fim

tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3], [1, 2, 4],
              [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print(r)
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print(r)
Example #8
def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)

sets = map(set, transactions)
print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print("{} items: {}".format(supp / float(trcount), "|".join(items)))
elif algo == "eclat":
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print("{} items: {}".format(supp, "|".join(items)))
elif algo == "eclat-rules":
    # rules come back as (consequent, antecedent, support, confidence)
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])  # sort by confidence %
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
        consequence = ele_to_str(consequence)
        print("{:6.2f}% of {} eles: If {} then {}".format(confidence_percent, support_count, " & ".join(condition), consequence))
elif algo == "arules":
    rules = fim.arules(transactions, supp=2, conf=75, report='aCl', eval='l', thresh=30)
    # rules = sorted(rules, key=lambda x: x[3])  # sort by confidence %
Example #9
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpmax
from mlxtend.frequent_patterns import association_rules
from fim import eclat
from tabulate import tabulate

df = pd.read_csv("plants_preprocessed.csv")
print("================ FP-Max ==================")
dataset = df.values.tolist()
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
itemsets = fpmax(df, min_support=0.001, use_colnames=True, max_len=10)
print(itemsets)

print("\n\nRules based on the FP-Max itemsets:\n\n")
rules = association_rules(itemsets, min_threshold=0.0001, support_only=True)
print(rules[['antecedents', 'consequents', 'support']])

print("================ ECLAT-Max ================")
itemsets = eclat(dataset, target='m', supp=2, report='s')
print(tabulate(itemsets, headers=['Itemset', 'Support'], tablefmt='pretty'))
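target='m' keeps only maximal itemsets (those with no frequent superset), which is why the ECLAT-Max output is much shorter than a full frequent-itemset listing. A toy comparison, assuming PyFIM's documented target codes:

from fim import eclat

baskets = [['a', 'b', 'c'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]
print(len(eclat(baskets, target='s', supp=50)))  # every frequent itemset
print(len(eclat(baskets, target='m', supp=50)))  # only the maximal ones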
Example #10
dataset = list()
for line in lines:
    # each line: party,vote1,vote2,... with 'y'/'n' votes; record the indices of the 'y' votes
    strpline = line.rstrip()
    arr = strpline.split(',')
    newline = [i for i in range(len(arr)) if arr[i] == 'y']
    # encode the party label as a sentinel item: 100 = republican, 200 otherwise
    if arr[0] == 'republican':
        newline.append(100)
    else:
        newline.append(200)
    dataset.append(newline)

print("\npart a:", file=outf)
frequentset = eclat(dataset, supp=20, zmin=1)
print("with 20%% support, there are %d frequent itemsets." % len(frequentset),
      file=outf)

print("\npart b:", file=outf)
print("Top ten itemsets:", file=outf)
frequentset.sort(key=lambda x: x[1], reverse=True)  # by descending support
for r in frequentset[:10]:
    print(r, file=outf)
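Because the party label is just another item, itemsets containing the sentinel can be filtered out directly; a sketch reusing frequentset and outf from above:

republican_patterns = [r for r in frequentset if 100 in r[0]]  # itemsets co-occurring with 'republican'
print(republican_patterns[:5], file=outf)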

print("\npart c:", file=outf)
Example #11
import pandas as pd
from fim import eclat

dataset = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
n = len(dataset)
transactions = []
for i in range(0, n):
    transaction = []
    m = len(dataset.values[i])
    for j in range(0, m):
        data = str(dataset.values[i, j])
        if data != "nan":  # skip the NaN padding on short rows
            transaction.append(data)
    transactions.append(transaction)

itemsets = eclat(tracts=transactions, zmin=1)
itemsets.sort(key=lambda x: x[1], reverse=True)  # by descending support
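The row-by-row NaN stripping can be written more compactly with pandas (same dataset variable):

transactions = [[str(v) for v in row if pd.notna(v)] for row in dataset.values]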
Example #12
    # weighted transactions: each tuple maps to a transaction weight;
    # note the duplicate key (1, 2, 3, 4): the later value 1.0 overwrites 0.3
    tracts = {
        (1, 2, 3): 0.5,
        (1, 4, 5): 1.2,
        (2, 3, 4): 0.8,
        (1, 2, 3, 4): 0.3,
        (2, 3): 1.5,
        (1, 2, 4): 0.9,
        (4, 5): 0.6,
        (1, 2, 3, 4): 1.0,
        (3, 4, 5): 0.7
    }
    print('transactions:')
    for t in tracts:
        print(t, tracts[t])
    if tid < 1:
        print('apriori(tracts, supp=-1.6, zmin=2):')
        for r in apriori(tracts, supp=-1.6, zmin=2):
            print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-1.6, zmin=2):')
        for r in eclat(tracts, supp=-1.6, zmin=2):
            print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-1.6, zmin=2):')
        for r in fpgrowth(tracts, supp=-1.6, zmin=2):
            print(r)
    else:
        print('fim(tracts, supp=-1.6, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-1.6, zmin=2, report='#'):
            print(r)
Example #13
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3],
              [1, 4, 5],
              [2, 3, 4],
              [1, 2, 3, 4],
              [2, 3],
              [1, 2, 4],
              [4, 5],
              [1, 2, 3, 4],
              [3, 4, 5],
              [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print(r)
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print(r)
Example #14
def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)


sets = map(set, transactions)
print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print("{} items: {}".format(supp / float(trcount), "|".join(items)))
elif algo == "eclat":
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print("{} items: {}".format(supp, "|".join(items)))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])  # sort by confidence %
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
        consequence = ele_to_str(consequence)
        print("{:6.2f}% of {} eles: If {} then {}".format(
            confidence_percent, support_count, " & ".join(condition),
            consequence))
elif algo == "arules":
    rules = fim.arules(transactions,
Example #15
import pandas as pd
from fim import eclat

data = pd.read_excel(r'D:\Accidents.xlsx')

df = pd.DataFrame(
    data,
    columns=[
        "Date d'accident DAT_ACC", "Heure d'accident HEU_ACC",
        "Decoupage Geographique COD_PRV", "Etat chaussée COD_ETA_CHA",
        "Etat surface COD_ETA_SUR", "Lumière COD_LUM",
        "Code Agglomeration COD_AGG", "Localisation LOC",
        "Type carrefours COD_TYP_CAR_AGG", "Profils en long COD_PRO_LON",
        "Type collision COD_TYP_COL", "Obstacle heurtés COD_OBS_HRT",
        "Point de choc initial COD_POI_CHO", "manœuvre COD_MAN"
    ])

## convert the DataFrame into a list of transactions
transactions = df.values.tolist()

## run the eclat algorithm
res = eclat(transactions, supp=10, zmin=10)

## turn the result list into a DataFrame

to_df = pd.DataFrame(res, columns=["Frequent Items", "Support"])

## print each itemset's support as a percentage

for index, row in to_df.iterrows():
    print(row['Frequent Items'], "\t\t",
          '%.2f' % (row['Support'] / len(df) * 100), "%")
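The percentage arithmetic at the end can be pushed into the miner itself: PyFIM's report codes include 'a' (absolute support, the default used above), 's' (relative support as a fraction) and 'S' (relative support as a percentage). A sketch under that assumption, reusing the transactions list:

res = eclat(transactions, supp=10, zmin=10, report='S')  # (itemset, support-in-%) pairs
for itemset, pct in res:
    print(itemset, "\t\t", '%.2f' % pct, "%")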
Example #16
    def mbasket(self,
                data_p,
                support_par,
                confidence_par,
                method='apriori',
                lift_par=1.2):
        """
        :param
        :return:
        """
        start0 = time()
        ## Apriori analysis + association rules creation
        # find association rules with default settings
        rules = pd.DataFrame()
        if method == 'fpgrowth':
            start = time()
            frequent_itemsets = pd.DataFrame(fpgrowth(data_p[1],
                                                      supp=support_par * 100,
                                                      zmin=1,
                                                      target='s',
                                                      report='s',
                                                      mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("fpgrowth() -", run_time, "s")

        elif method == 'eclat':
            start = time()
            frequent_itemsets = pd.DataFrame(eclat(data_p[1],
                                                   supp=support_par * 100,
                                                   zmin=1,
                                                   target='s',
                                                   report='s',
                                                   mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("eclat() -", run_time, "s")

        elif method == 'relim':
            start = time()
            frequent_itemsets = pd.DataFrame(relim(data_p[1],
                                                   supp=support_par * 100,
                                                   zmin=1,
                                                   target='s',
                                                   report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("relim() -", run_time, "s")

        elif method == 'sam':
            start = time()
            frequent_itemsets = pd.DataFrame(sam(data_p[1],
                                                 supp=support_par * 100,
                                                 zmin=1,
                                                 target='s',
                                                 report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("sam() -", run_time, "s")

        elif method == 'ista':
            start = time()
            frequent_itemsets = pd.DataFrame(ista(data_p[1],
                                                  supp=support_par * 100,
                                                  zmin=1,
                                                  report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("ista() -", run_time, "s")

        else:
            start = time()
            frequent_itemsets = pd.DataFrame(apriori(data_p[1],
                                                     supp=support_par * 100,
                                                     zmin=1,
                                                     target='s',
                                                     report='s',
                                                     mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            rules = self.find_rules(frequent_itemsets, lift_par,
                                    confidence_par)
            print("apriori() -", run_time, "s")

        # find users whose baskets contain the antecedents of the rules calculated above
        if rules.shape[0] > 0:
            pivot_binary_tr = data_p[0].transpose()
            recom = {}
            pb = {}
            rules['antecedents'] = rules['antecedents'].apply(
                lambda x: frozenset(x))
            for user in pivot_binary_tr.columns:
                products_bought = pivot_binary_tr.index[pivot_binary_tr[user]
                                                        == 1]
                pb[user] = products_bought
                suitable_rules = []
                for ante in rules['antecedents'].items():
                    if ante[1].issubset(products_bought):  # TODO: improve this check
                        suitable_rules.append(ante[0])
                recom[user] = suitable_rules

            recom = pd.DataFrame.from_dict(recom,
                                           orient='index').stack().reset_index(
                                               level=1,
                                               drop=True).reset_index()
            recom.columns = ['review_profilename', 'Rule']

            # products bought, used later to exclude these products from the recommendations
            pb2 = pd.DataFrame.from_dict(pb,
                                         orient='index').stack().reset_index(
                                             level=1, drop=True).reset_index()
            pb2.columns = ['review_profilename', 'antecedents1']

            rule_cons = rules[['antecedents', 'consequents']].reset_index()
            rule_cons['consequents'] = [
                i for i, *_ in rule_cons['consequents']
            ]  # change format from frozensets to normal
            rule_cons['antecedents'] = [
                list(i) for i in rule_cons['antecedents']
            ]
            rule_cons.columns = ['Rule', 'antecedents', 'consequents']
            recom = recom.merge(rule_cons, on='Rule')
            recom.drop_duplicates(['review_profilename', 'consequents'],
                                  keep='first',
                                  inplace=True)

            # exclude from recommendations products already bought
            recom_already_satisfied = pb2.merge(
                recom,
                left_on=['review_profilename', 'antecedents1'],
                right_on=['review_profilename', 'consequents'])
            recom_already_satisfied['beer_already_known'] = 1
            sum_recom_already_satisfied = recom_already_satisfied[
                'beer_already_known'].sum()

            recom_new = recom.merge(
                recom_already_satisfied[[
                    'review_profilename', 'Rule', 'consequents',
                    'beer_already_known'
                ]],
                on=['review_profilename', 'Rule', 'consequents'],
                how='left')
            recom_new = recom_new[recom_new['beer_already_known'] != 1][[
                'review_profilename', 'Rule', 'antecedents', 'consequents'
            ]]
        else:
            rule_cons = 0
            recom_new = 0
            sum_recom_already_satisfied = 0

        mba_time = round(time() - start0, 2)
        print("mbasket() -", mba_time, "s")

        return [
            rule_cons, recom_new, mba_time, sum_recom_already_satisfied,
            run_time
        ]
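A usage sketch for mbasket, assuming model is an instance of the enclosing class and data_p is the expected pair of a binary user-by-product pivot table and the raw transaction lists; all names and values are illustrative. Note that in this snippet only the default apriori path computes rules, so other methods return timings with empty results:

pivot = pd.DataFrame({'ale': [1, 0], 'stout': [0, 1], 'lager': [1, 1]},
                     index=['user1', 'user2'])  # users as rows, products as columns
transactions = [['ale', 'lager'], ['stout', 'lager']]
out = model.mbasket((pivot, transactions), support_par=0.01,
                    confidence_par=0.3, method='eclat')
rule_cons, recom_new, mba_time, n_already_known, run_time = out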
Example #17
lyf_tag_list = ICCv1['lyf_tag_list']
lyf_tag_log = ICCv1['lyf_tag_log']
items = [str(elem['_id']) for elem in lyf_tag_list.find()]
sample = [elem['tags'] for elem in lyf_tag_log.find()]

tag_dict = dict()
for elem in lyf_tag_list.find():
    tag_dict[str(elem['_id'])] = elem['tag']

all_tag = []
for elem in sample:
    all_tag.extend(elem)

# tag ids that occur in the log but have no entry in the tag list
diff = set(all_tag) - set(items)

for elem in diff:
    tag_dict[elem] = ''

res = fim.eclat(sample, supp=50)

# look up the Chinese tag names via tag_dict when printing


def element(lis):
    for elem in lis:
        if isinstance(elem, str):
            print(tag_dict[elem])
        elif isinstance(elem, int):
            print(elem)
        else:
            element(elem)
Example #18
# and the dataset from https://www.kaggle.com/puneetbhaya/online-retail


# Importing the libraries
import numpy as np
import pandas as pd
from fim import eclat
help(eclat)


######## Data Preprocessing
dataset = pd.read_excel('Online Retail.xlsx')

# Replace null descriptions with the stock code
for i, d in dataset[dataset['Description'].isnull()].iterrows():
    dataset.loc[i, 'Description'] = "Code-" + str(d['StockCode'])

# group into baskets
grouped = dataset.groupby('InvoiceNo')

# rearrange into a list of transactions
transactions = []
for name, group in grouped:
    transactions.append(list(group['Description'].map(str)))
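The two grouping steps above can also be collapsed into a single pandas expression (same dataset):

transactions = dataset.groupby('InvoiceNo')['Description'].apply(lambda g: list(g.map(str))).tolist()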

##### Training 
report = eclat(transactions, target='s', supp=1, zmin=2)

## compare with apriori from the same fim module (its API differs from the apyori package)
from fim import apriori
help(apriori)
areport = apriori(transactions, report='l,c', target='r', supp=1, zmin=2)