Beispiel #1
0
  def mine_seqs(self, baskets):
    """Mine frequent sequences from `baskets`, keep the ones whose lift over
    the best single-token split exceeds `self.min_lift`, then reduce them to
    maximal sequences.

    :param baskets: iterable of item sequences handed to pymining.
    :return: list of maximal (sequence, support, lift) results.
    """
    # Python 3 print() calls (the original used Python 2 print statements,
    # inconsistent with the rest of this file).
    print("mining frequent sequences")
    freq_seqs = seqmining.freq_seq_enum(baskets, len(baskets) * self.min_support)
    print("found {} frequent sequences".format(len(freq_seqs)))

    total = len(baskets)
    # NOTE(review): keying supports by frozenset collapses sequences that
    # share an item set (later entries overwrite earlier ones) -- confirm
    # that is intended.
    seq_supports = {frozenset(x[0]) : float(x[1])/total for x in freq_seqs}
    out_seqs = []
    # test the lift of each rule
    for seq in freq_seqs:
      seq_key = frozenset(seq[0])
      if len(seq_key) < 2:
        continue
      sup_total = seq_supports[seq_key]
      # highest support product over all single-token splits of the sequence
      sup_split_max = 0
      for token in seq_key:
        token_set = frozenset((token,))
        sup_token = seq_supports[token_set]
        # NOTE(review): assumes the remainder set is itself a frequent
        # sequence key; a KeyError here would mean it is not -- verify.
        sup_rest = seq_supports[seq_key-token_set]

        if sup_token*sup_rest > sup_split_max:
          sup_split_max = sup_token*sup_rest

      if (sup_total/sup_split_max) > self.min_lift:
        out_seqs.append((seq[0], sup_total, sup_total/sup_split_max))

    freq_seqs = out_seqs
    print("found {} sequences with sufficient lift".format(len(freq_seqs)))
    freq_seqs = self.nonmax_suppression_seqs(freq_seqs)
    print("found {} maximal sequences".format(len(freq_seqs)))
    return freq_seqs
def run_sequence_mining(students, min_support, filter_type):
    """Build one chronological course sequence per student and mine the
    frequent sequences with the given minimum support.

    filter_type:
      'generic_ge'  -- keep only GE courses, collapsed to the token "GE"
      'cs_only'     -- keep core/bonus courses by name
      anything else -- keep every course, prefixed with its semester key
    """
    sequences = []
    for student in students:
        courses = []
        for sem_key in sorted(student.course_seq_dict.keys()):
            history = student.course_seq_dict[sem_key]
            if filter_type == 'generic_ge':
                tokens = ["GE" for course in history if course.type == "ge"]
            elif filter_type == "cs_only":
                tokens = [course.name for course in history
                          if course.type == "core" or course.type == "bonus"]
            else:
                tokens = [str(sem_key) + "_" + course.name for course in history]
            # courses within one semester are kept in sorted order
            courses.extend(sorted(tokens))
        sequences.append(courses)
    print("init run")
    datas = seqmining.freq_seq_enum(sequences, min_support)
    # reshape each (pattern, support) pair into [support, [pattern]]
    return [[data[1], [data[0]]] for data in datas]
Beispiel #3
0
    def fit(self, train_data=None):
        """
        Fit the model: mine frequent sequences, then build a prefix tree.

        :param train_data: (optional) DataFrame with the training sequences, which must be assigned to column "sequence".
            If None, run FSM using SPFM over the sequence database stored in `self.db_path`.
            Otherwise, run FSM using `pymining.seqmining` (slower).
        :raises ValueError: if called without arguments while `self.spmf_path`
            or `self.db_path` is unset, or a frequent sequence of length 0 is met.
        :raises NameError: if `self.minsup` is outside [0, 1] on the SPMF path.
        """

        if train_data is None:
            if self.spmf_path is None or self.db_path is None:
                raise ValueError(
                    "You should set db_path and spfm_path before calling fit() without arguments."
                )

            self.logger.info('Using SPFM (Java) for Frequent Sequence Mining')
            # SPMF takes the minimum support as a percentage string (built below)
            if 0 <= self.minsup <= 1:
                percentage_min_sup = self.minsup * 100
            else:
                raise NameError("SPMF only accepts 0<=minsup<=1")

            # call spmf
            command = ' '.join([
                self.spmf_algorithm, self.db_path, self.output_path,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            # (presumably populates self.freq_seqs, which is read below -- confirm)
            self._parse_spfm_output()
        else:
            # use pymining
            self.logger.info(
                'Using pymining.seqmining (python) for Frequent Sequence Mining'
            )
            sequences = train_data['sequence'].values
            # a relative minsup in [0, 1] is scaled to an absolute count
            msup = int(
                self.minsup *
                len(sequences)) if 0 <= self.minsup <= 1 else self.minsup
            self.logger.info(
                'Mining frequent sequences (minsup={})'.format(msup))
            self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.info('Building the prefix tree')
        self.tree = SmartTree()
        self.root_node = self.tree.set_root()
        for pattern, support in self.freq_seqs:
            if len(pattern) == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.root_node,
                                      data={"support": support})
            elif len(pattern) > 1:
                # add entire path starting from root
                self.tree.add_path(self.root_node, pattern, support)
            else:
                raise ValueError('Frequent sequence of length 0')
        self.logger.info('Training completed')
def run_sequnce_testing():
    """Mine and print frequent sequences (min support 4) from five
    hard-coded course-history strings."""
    course_histories = (
        "CSC100,CSC200,CSC300,MATH100,MATH200",
        "CSC100,MATH100,MATH200,CSC200,CSC300",
        "CSC100,MATH200,CSC300,CSC200,MATH100",
        "CSC200,MATH100,CSC100,CSC300,MATH200",
        "MATH100,MATH200,CSC100,CSC300,CSC200",
    )
    for mined in seqmining.freq_seq_enum(course_histories, 4):
        print(mined)
Beispiel #5
0
def fun3():
    """Interactive demo: mine frequent sequences (min support 2) from a
    fixed toy dataset and explain the output format."""
    demo_seqs = ('caabc', 'abcb', 'cabc', 'abbca')
    mined = seqmining.freq_seq_enum(demo_seqs, 2)
    print("The default sequence data is:")
    print(demo_seqs)

    time.sleep(1)
    # pause so the user can read the data before the results appear
    input("Press any key to see the discovered frequent seqences")
    print(sorted(mined))
    print(
        "\nNote:(('a', 'b'), 4) means:\nIn the given seqs tuple, there are 4 times that 'b' appears after 'a'"
    )
    input("Press Any button to return to CONTENT")
    def fit(self, seqs):
        """Takes a list of list of sequences .

        Mines frequent sequences either via the external SPMF tool (when
        both `self.spmf_path` and `self.db_path` are set) or in-process with
        pymining, then builds the frequent-sequence prefix tree in `self.tree`.

        :param seqs: list of sequences; only used when SPMF is not configured.
        :raises NameError: if `self.minsup` is outside [0, 1] on the SPMF
            path, or if a frequent sequence of length 0 is encountered.
        """

        if self.spmf_path and self.db_path:
            self.logger.info("Using SPMF")
            # parse minsup: SPMF expects a percentage
            if 0 <= self.minsup <= 1:
                percentage_min_sup = self.minsup * 100
            else:
                raise NameError("SPMF only accepts 0<=minsup<=1")

            # call spmf
            algorithm = "PrefixSpan"
            command = ' '.join([
                algorithm, self.db_path, self.outputPath,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            self._parse_SPMF_output()
        elif seqs:
            # NOTE(review): msup stays a float here (the sibling fit() casts
            # to int) -- confirm pymining accepts a float threshold.
            msup = self.minsup * len(
                seqs) if 0 <= self.minsup <= 1 else self.minsup

            self.logger.debug('Mining frequent sequences')
            self.freq_seqs = seqmining.freq_seq_enum(seqs, msup)
        else:
            # NOTE(review): execution continues after this error and fails
            # below if self.freq_seqs was never set -- confirm intent.
            self.logger.error(
                "No sequence database path nor sequence list provided.")

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.debug('Building frequent sequence tree')
        self.tree = SmartTree()
        self.rootNode = self.tree.set_root()
        # unpack (pattern, support) directly; the original loop variable was
        # named `tuple`, shadowing the builtin
        for pattern, support in self.freq_seqs:
            if len(pattern) == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.rootNode,
                                      data={"support": support})
            elif len(pattern) > 1:
                # add entire path starting from root
                self.tree.add_path(self.rootNode, pattern, support)
            else:
                raise NameError('Frequent sequence of length 0')
        self.logger.debug('Tree completed')
Beispiel #7
0
def frequentSequences(gs, samples=None, minsup=None, window_len=3, days=1, granularity=None):
    """
    Returns frequent sequences mined using pymining
    gs : gSpan object
    minsup : minimum support to decide for frequency of a sequence
             ([1,2,1,3], [5,1,1,5]) with minsup=2 will return [1,1]
    window_len : specifies how many sequences are in a window
    days : used for getting sequences, check getSequences for detail
    granularity : is the "speed"(or step) of the window

    Example : window_len=7 days=1
              The sequences will be daily subgraphs and window of 7
              will act like a week

    Returns an OrderedDict mapping window id -> list of
    (pattern-as-int-tuple, support) pairs.
    """
    seqs = getSequences(gs, samples, days)

    # "Defaults" to window
    if not granularity:
        granularity = window_len
    if not minsup:
        minsup = window_len

    res = OrderedDict()
    freq_id = 0
    window_start = 0
    window_end = window_len
    # NOTE(review): the final window (window_end == len(seqs)) is skipped
    # by the strict `<` -- confirm that is intended.
    while window_end < len(seqs):
        window = seqs[window_start:window_end]
        freq_seqs = seqmining.freq_seq_enum(window, minsup)

        # chr-to-int conversion of every mined pattern
        res[freq_id] = [(tuple(ord(ch) for ch in fseq[0]), fseq[1])
                        for fseq in freq_seqs]

        window_start += granularity
        window_end += granularity
        freq_id += 1

    return res
Beispiel #8
0
def mine_patterns(data, MINING_METHOD, CONFUSION_MATRIX):
    """Mine patterns per confusion-matrix cell.

    :param data: dict KEY -> {trace_id -> sequence/itemset of events}.
    :param MINING_METHOD: 'seq_mining' or 'item_mining'.
    :param CONFUSION_MATRIX: iterable of cell keys ('tp', 'tn', 'fp', 'fn').
    :return: dict KEY -> list of (pattern, support) pairs.
    :raises ValueError: for an unknown MINING_METHOD. (The original code
        fell through and raised NameError on an unbound local instead.)
    """
    if (MINING_METHOD == 'seq_mining'):
        mined_patterns = {
            KEY: sorted(seqmining.freq_seq_enum([data[KEY][trace_id] for trace_id in data[KEY]], min_support=2))
            for KEY in CONFUSION_MATRIX
        }
    elif (MINING_METHOD == 'item_mining'):
        mined_patterns_to_be_preprocessed = {
            KEY: itemmining.relim(itemmining.get_relim_input([data[KEY][trace_id] for trace_id in data[KEY]]), min_support=2)
            for KEY in CONFUSION_MATRIX
        }

        # normalize relim's frozenset keys into (pattern-tuple, support) pairs
        mined_patterns = {
            KEY: [
                (tuple(element), mined_patterns_to_be_preprocessed[KEY][element])
                for element in mined_patterns_to_be_preprocessed[KEY]]
            for KEY in CONFUSION_MATRIX
        }
    else:
        raise ValueError('unknown MINING_METHOD: {}'.format(MINING_METHOD))
    return mined_patterns
Beispiel #9
0
def strict_period():
    """Bucket each user's rows from FILE into fixed PERIOD-second windows
    and mine frequent aversion sequences.

    FILE is a CSV with the user id in column 2, a duration (ms) in column 3
    and an aversion flag in column 6. Each user becomes one string with one
    character per window: 'a' if an aversion occurred in that window,
    'c' otherwise.
    """
    # user id -> window string (the original named this `dict`, shadowing
    # the builtin)
    user_sequences = {}

    def flush(user, windows):
        # Store the finished user's sequence under *their own* id. The
        # original stored it under the NEXT user's id (off-by-one) and
        # never stored the final user at all.
        if user != "******":  # skip the initial sentinel value
            user_sequences[user] = "".join(windows)

    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)  # skip the header row
        curr_usr = "******"
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if not curr_usr == row[2]:
                flush(curr_usr, temp)
                curr_usr = row[2]
                temp = []
                # NOTE(review): curr_time/aversion are NOT reset per user,
                # matching the original -- confirm that carry-over is intended.
            if row[2] == "":
                continue
            curr_time += int(row[3])
            if row[6] == "0":
                aversion = "a"
            if curr_time > PERIOD * 1000:
                # window complete: record it and start the next one
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"
        flush(curr_usr, temp)  # flush the final user (the original lost it)

    print(user_sequences.values())

    print("Strict period \n\n")
    freq_seqs = seqmining.freq_seq_enum(list(user_sequences.values()), 8)

    for fs in sorted(freq_seqs):
        print(fs)
    print("\n")
    print("\n")

    print("\n\n\n")
Beispiel #10
0
from pymining import itemmining
from pymining import seqmining
import sys

# CLI entry: freq_seq.py <data file> <min support>
if (len(sys.argv) != 3):
    print(
        "Please provide the data file and the minimum support as input, e.g., python freq_seq.py ./output.txt 40"
    )
    sys.exit(-1)

# Each input line looks like "<prefix>---[item, item, ...]"; the part after
# "---" is split into a sequence. `with` guarantees the file is closed
# (the original leaked the handle).
with open(sys.argv[1], 'r') as f:
    lines = f.read().splitlines()
seqs = []
for s in lines:
    seq = s.split("---")[1]
    seq = seq[1:-1]  # strip the surrounding brackets
    seqs.append(seq.split(", "))

freq_seqs = seqmining.freq_seq_enum(seqs, int(sys.argv[2]))
for p in freq_seqs:
    print(p)
Beispiel #11
0
import pandas as pd
import numpy as np
from pymining import seqmining, itemmining, assocrules, perftesting
import matplotlib.pyplot as plt
# %matplotlib inline  -- IPython magic; invalid syntax outside a notebook cell
import seaborn as sns
sns.set()

# Load the inspection data and plot the 20 most frequent violation codes.
studydf = pd.read_csv("studydf.csv")
violationdf = studydf[['INSPECTION DATE','VIOLATION CODE']].reset_index()
violationdf['VIOLATION CODE'] = violationdf['VIOLATION CODE'].astype('str')
plotseries = violationdf['VIOLATION CODE'].value_counts().iloc[0:20]
ax = sns.barplot(y=plotseries.index, x=plotseries.values, palette="Blues_d")
# NOTE(review): this assumes 'CAMIS' is the (named) index of studydf so that
# reset_index() exposes it as a column -- confirm against studydf.csv.
testdf = violationdf.groupby(['CAMIS'])['VIOLATION CODE'].apply(list)
# Mine the first 10 establishments' violation-code sequences.
minelist = testdf.tolist()[0:10]
minelist = tuple(tuple(x) for x in minelist)
relim_input = itemmining.get_relim_input(minelist)
item_sets = itemmining.relim(relim_input, min_support=2)
rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
print(rules)  # was a Python 2 print statement
freq_seqs = seqmining.freq_seq_enum(minelist, 2)
print(freq_seqs)
rules2 = assocrules.mine_assoc_rules(item_sets, min_support=1, min_confidence=0.5)
print(rules2)
Beispiel #12
0
        #     assocpairresults.write(str(rules))
        # rulepitdurlens.append(len(rules))

        # allpat = seqmining.freq_seq_enum(oneFamily, sup)
        # allpat  = seqmining.freq_seq_enum(pitdurfam, sup)
        # with open("Freq_{}.txt".format(sup),"w") as freqresults:
        #     freqresults.write("this many:" + str(len(allpat)))
        #     freqresults.write(str(allpat))
        # patpitlens.append(len(allpat))

        # Encode every pitch list as one string (one chr per pitch value) so
        # pymining's freq_seq_enum can treat it as a character sequence.
        # NOTE(review): `inputlist` and `sup` come from an enclosing scope
        # not visible in this chunk -- confirm their types there.
        chrinputlist = []
        for pitlist in inputlist:
            chrinputlist.append("".join([chr(x) for x in pitlist]))
        # print(chrallpit)

        allpat = seqmining.freq_seq_enum(chrinputlist, sup)
        # print("length allpat:"+str(len(allpat)))
        # print(allpat)
        # record how many frequent patterns were found for this `sup`
        patpitlens.append(len(allpat))

        # allpat = seqmining.freq_seq_enum(durfam, sup)
        # with open("Freqpit_{}.txt".format(sup),"w") as freqresults:
        #     freqresults.write("this many:" + str(len(allpat)))
        #     freqresults.write(str(", ".join([[ord(x) for x in strings] for strings in allpat])))
        # patpitdurlens.append(len(allpat))

import numpy as np
print(len(patpitlens))
# print(len(patpitlens)/26)
print(len(allpit))
# NOTE(review): the reshape assumes len(itempitlens) == (len(allpit)-1) * 29;
# it raises ValueError otherwise -- confirm against the loop that fills it.
itemmatrix = np.array(itempitlens).reshape(len(allpit)-1,29)
Beispiel #13
0
    #     assocpairresults.write(str(rules))
    # rulepitdurlens.append(len(rules))

    # allpat = seqmining.freq_seq_enum(oneFamily, sup)
    # allpat  = seqmining.freq_seq_enum(pitdurfam, sup)
    # with open("Freq_{}.txt".format(sup),"w") as freqresults:
    #     freqresults.write("this many:" + str(len(allpat)))
    #     freqresults.write(str(allpat))
    # patpitlens.append(len(allpat))

    # Encode every pitch list as one string (one chr per pitch value) so
    # pymining's freq_seq_enum can treat it as a character sequence.
    # NOTE(review): `allpit` and `sup` come from an enclosing scope not
    # visible in this chunk -- confirm their types there.
    chrallpit = []
    for pitlist in allpit:
        chrallpit.append("".join([chr(x) for x in pitlist]))
    # print(chrallpit)

    allpat = seqmining.freq_seq_enum(chrallpit, sup)
    print("length allpat:" + str(len(allpat)))
    # print(allpat)
    patdurlens.append(len(allpat))

    # allpat = seqmining.freq_seq_enum(durfam, sup)
    with open("Freqpit_{}.txt".format(sup), "w") as freqresults:
        freqresults.write("this many:" + str(len(allpat)))
        # BUG(review): str.join is given *lists*, not strings, and each
        # element of `allpat` is a (pattern, support) tuple, so ord(x)
        # receives a tuple -- this line raises TypeError when executed.
        # Likely intent: ", ".join(str([ord(c) for c in pat]) for pat, _ in allpat)
        freqresults.write(
            str(", ".join([[ord(x) for x in strings] for strings in allpat])))
    # patpitdurlens.append(len(allpat))

import matplotlib.pyplot as plt
plt.switch_backend('agg')  # headless backend: render without a display
plt.figure()
# plt.plot(rulepitdurlens)
def get_sequences(seqs, size):
    """Return the frequent sequences of `seqs` with minimum support `size`."""
    return seqmining.freq_seq_enum(seqs, size)
			if output[i]==2:
				str1=str1+'G'   #Ground
			if output[i]==3:
				str1=str1+'p'    #player
			if output[i]==4:
				str1=str1+'b'   #boundary
			if output[i]==5:
				str1=str1+'C'   #crowd
			if output[i]==6:
				str1=str1+'S'   #sky
		s.append(str1)
		#print(str1)
	#print(len(s))	
	
	print (s)
	freq_seqs = seqmining.freq_seq_enum(s, 2)
	print(sorted(freq_seqs))
	a=list(freq_seqs)
	a1=[]
	print(a)
	for i in range(len(freq_seqs)):
		s1=""
		for j in range(len(a[i][0])):
			s1 = s1+(a[i][0][j])
		a1.append(s1)
	print(a1)
	file = open("t.txt","w")
	for i in range(len(a1)):
		file.write(a1[i]+"\n")
	file.close() 
	'''ans={0:"pitch",1:"batsmen",2:"ground",3:"player",
Beispiel #16
0
def Apriori_four(data_tuple):
    """Demo: mine and print frequent sequences (min support 2) from a fixed
    toy dataset.

    NOTE(review): `data_tuple` is accepted but unused -- the sequences are
    hard-coded; confirm whether the parameter should feed the miner.
    """
    demo_seqs = ('caabc', 'abcb', 'cabc', 'abbca')
    mined = seqmining.freq_seq_enum(demo_seqs, 2)
    print( sorted(mined) )
Beispiel #17
0
def get_seqence(corpus):
    """Return all frequent sequences of `corpus` (minimum support 1),
    sorted."""
    return sorted(seqmining.freq_seq_enum(corpus, 1))
    def handle(self, *args, **kwargs):
        TARGET_JOB = 71
        SPLITID = 12
        job_obj = Job.objects.filter(pk=TARGET_JOB)[0]
        split_obj = Split.objects.filter(pk=SPLITID)[0]

        training_df, test_df = get_encoded_logs(job_obj)

        test_df1 = test_df.copy()
        test_df2 = test_df.copy()
        test_df3 = test_df.copy()

        # todo: retrieve lime explanation

        # RETRIEVE&SAVE TS
        ts_exp_job, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.TEMPORAL_STABILITY.value,
            split=split_obj,
            predictive_model=job_obj.predictive_model,
            job=job_obj)
        ts = temporal_stability(ts_exp_job,
                                training_df,
                                test_df1,
                                explanation_target=None)

        # RETRIEVE&SAVE LIMETS
        limets_exp_job, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.LIME.value,
            split=split_obj,
            predictive_model=job_obj.predictive_model,
            job=job_obj)
        lime_ts = lime_temporal_stability(limets_exp_job,
                                          training_df,
                                          test_df2,
                                          explanation_target=None)

        # SAVE GOLD
        gold = test_df3[['trace_id', 'label']]

        # todo: retrieve confusion matrix

        ts = {
            asdf: {
                uuu + '1' if uuu[-1:] == '_' else uuu: ts[asdf][uuu]
                for uuu in ts[asdf]
            }
            for asdf in ts
        }
        lime_ts = {
            asdf: {
                uuu + '1' if uuu[-1:] == '_' else uuu: lime_ts[asdf][uuu]
                for uuu in lime_ts[asdf]
            }
            for asdf in lime_ts
        }

        trace_ids = set(gold['trace_id'])
        confusion_matrix = {
            'tp': [
                str(tid) for tid in trace_ids
                if str(tid) in ts and ts[str(tid)]['prefix_' +
                                                   str(len(ts[str(tid)]))]
                ['predicted'] == 'true' and ts[str(tid)]
                ['prefix_' +
                 str(len(ts[str(tid)]))]['predicted'] == ('true' if gold[
                     gold['trace_id'] == tid]['label'].values[0] else 'false')
            ],
            'tn': [
                str(tid) for tid in trace_ids
                if str(tid) in ts and ts[str(tid)]['prefix_' +
                                                   str(len(ts[str(tid)]))]
                ['predicted'] == 'false' and ts[str(tid)]
                ['prefix_' +
                 str(len(ts[str(tid)]))]['predicted'] == ('true' if gold[
                     gold['trace_id'] == tid]['label'].values[0] else 'false')
            ],
            'fp': [
                str(tid) for tid in trace_ids
                if str(tid) in ts and ts[str(tid)][
                    'prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'true'
                and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]
                ['predicted'] != ('true' if gold[
                    gold['trace_id'] == tid]['label'].values[0] else 'false')
            ],
            'fn': [
                str(tid) for tid in trace_ids
                if str(tid) in ts and ts[str(tid)][
                    'prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'false'
                and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]
                ['predicted'] != ('true' if gold[
                    gold['trace_id'] == tid]['label'].values[0] else 'false')
            ]
        }

        limefeats = {
            k: {
                key: [
                    element for element in sorted(
                        [(pref, lime_ts[key]
                          ['prefix_' +
                           str(job_obj.encoding.prefix_length)][pref]['value'],
                          lime_ts[key]['prefix_' +
                                       str(job_obj.encoding.prefix_length)]
                          [pref]['importance']) for pref in lime_ts[key]
                         ['prefix_' + str(job_obj.encoding.prefix_length)]],
                        key=lambda x: (x[2], x[1]),
                        reverse=True if k in ['tp', 'fp'] else False
                        # reverse order of lime values if the prediction is negative
                    )
                ]
                for key in confusion_matrix[k] if 'prefix_' +
                str(job_obj.encoding.prefix_length) in lime_ts[key]
            }
            for k in confusion_matrix
        }

        freq_seqs = {'tp': {}, 'tn': {}, 'fp': {}, 'fn': {}}

        # todo: retrive patterns
        CONFUSION_MATRIX = ['tp', 'tn', 'fp', 'fn']

        LIMEFEATS = {
            'abs_lime': False,
            'tp': 0.2,
            'tn': 0.2,
            'fp': 0.2,
            'fn': 0.2,
            'top': 10,
            'outputfile': None
        }
        FREQ_SEQS = {
            'tp': 10,
            'tn': 10,
            'fp': 10,
            'fn': 10,
            'top': 15,
            'outputfile': None,
            'RECOMPUTEDoutputfile': None,
        }
        ABSENCE = {
            'tp': 0.1,
            'tn': 0.1,
            'fp': 0.1,
            'fn': 0.1,
            'ABSENCEoutputfile': None
        }

        MINING_METHOD = 'item_mining'

        print(
            'Initial CONFUSION MATRIX:\n', *[
                '\tlimefeats ' + KEY + ':' + str(len(limefeats[KEY]))
                for KEY in CONFUSION_MATRIX
            ], '\n', *[
                '\tfreq_seqs ' + KEY + ':' + str(len(freq_seqs[KEY]))
                for KEY in CONFUSION_MATRIX
            ])

        available_values = {}
        for KEY in CONFUSION_MATRIX:
            available_values[KEY] = {}
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    if event[0].split('_')[0] not in available_values[KEY]:
                        available_values[KEY][event[0].split('_')[0]] = set()
                    available_values[KEY][event[0].split('_')[0]].add(event[1])

        filtered_limefeats = {
            KEY: {
                tid: [
                    event for event in limefeats[KEY][tid]
                    if ((not LIMEFEATS['abs_lime']) and (
                        (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                        (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY])))
                    or
                    (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
                ]
                for tid in limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        prefiltered_limefeats = {
            KEY: {
                tid: [
                    event for event in limefeats[KEY][tid]
                    if ((not LIMEFEATS['abs_lime']) and (
                        (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                        (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY])))
                    or
                    (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
                ]
                for tid in limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        filtered_limefeats_mine = {
            KEY: {
                tid: prefiltered_limefeats[KEY][tid][0:LIMEFEATS['top']]
                for tid in prefiltered_limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        for KEY in CONFUSION_MATRIX:
            for k in list(filtered_limefeats[KEY]):
                if len(filtered_limefeats[KEY][k]) == 0:
                    del filtered_limefeats[KEY][k]

        def tassellate_numbers(element):
            element = str(element)
            return str(element).split('.')[0][0] + '0' \
                if \
                '.' in str(element) \
                and \
                len(str(element)) <= 5 \
                else \
                str(element).split('.')[0][0:4] \
                    if \
                    '.' in str(element) \
                    and \
                    len(str(element)) >= 10 \
                    else \
                    element

        def retrieve_right_len(element, available_values):
            if '_' in element:
                return len(available_values[element.split('_')[0]])
            else:
                retval = []
                for attribute in available_values:
                    if any([
                            str(element) == str(tassellate_numbers(value))
                            for value in available_values[attribute]
                    ]):
                        retval += [len(available_values[attribute])]
                return max(retval)

        def weight_freq_seqs(KEY, available_values, element, limefeats):
            print(element[0])
            print(
                'frequency:', element[1], ' * ', 'len w/out absences: ',
                len([el for el in element[0] if 'absence' not in el]), ' * ',
                'sum of enumerator of possible values: ',
                sum([
                    retrieve_right_len(el, available_values[KEY])
                    for el in element[0] if 'absence' not in el
                ]), ' / ',
                'amount of examples in the field of confusion matrix: ',
                len(limefeats[KEY]), ' = ',
                (element[1] *
                 len([el for el in element[0] if 'absence' not in el]) * sum([
                     retrieve_right_len(el, available_values[KEY])
                     for el in element[0] if 'absence' not in el
                 ])) / len(limefeats[KEY]))
            return (
                element[1]  # *
                # len([el for el in element[0] if 'absence' not in el]) *
                # sum([retrieve_right_len(el, available_values[KEY]) for el in element[0] if 'absence' not in el])
            ) / len(limefeats[KEY])

        filtered_freq_seqs_old = {
            KEY: sorted([
                element for element in freq_seqs[KEY]
                if weight_freq_seqs(KEY, available_values, element, limefeats)
                >= FREQ_SEQS[KEY]
            ],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        prefiltered_freq_seqs = {
            KEY: sorted([
                element for element in freq_seqs[KEY]
                if weight_freq_seqs(KEY, available_values, element, limefeats)
                >= FREQ_SEQS[KEY]
            ],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        #todo: is this the actual topK?
        filtered_freq_seqs = {
            KEY: prefiltered_freq_seqs[KEY][0:FREQ_SEQS['top']]
            for KEY in CONFUSION_MATRIX
        }

        print(
            'CONFUSION MATRIX after filtering:\n', *[
                '\tlimefeats ' + KEY + ':' + str(len(filtered_limefeats[KEY]))
                for KEY in CONFUSION_MATRIX
            ], '\n', *[
                '\tfreq_seqs ' + KEY + ':' + str(len(filtered_freq_seqs[KEY]))
                for KEY in CONFUSION_MATRIX
            ])

        def printout_freq_seqs(output_obj, output_file, maxlinelength=5000):
            with open(output_file, 'w+') as f:
                f.write(prettyjson(output_obj, maxlinelength=maxlinelength))

        if (LIMEFEATS['outputfile'] is not None
                or FREQ_SEQS['outputfile'] is not None):
            print('Start saving results..')
            if (LIMEFEATS['outputfile'] is not None):
                printout_freq_seqs(filtered_limefeats,
                                   LIMEFEATS['outputfile'],
                                   maxlinelength=5000)
            if (FREQ_SEQS['outputfile'] is not None):
                printout_freq_seqs(filtered_freq_seqs,
                                   FREQ_SEQS['outputfile'],
                                   maxlinelength=200)
            print('Results saved.')
        else:
            print('FILTERED_LIMEFEATS:\n', filtered_limefeats)
            print('FILTERED_FREQ_SEQS:\n', filtered_freq_seqs)

        print('Computing absence...')

        attributes = {}
        for KEY in CONFUSION_MATRIX:
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    attribute_name = event[0]
                    if attribute_name not in attributes:
                        attributes[attribute_name] = set()
                    attributes[attribute_name].add(event[1])

        attributes_occurrences = {
            'tp': collections.Counter(),
            'fp': collections.Counter(),
            'tn': collections.Counter(),
            'fn': collections.Counter()
        }

        for KEY in CONFUSION_MATRIX:
            found_stuff = []
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    found_stuff += [tassellate_numbers(event[1])]

            attributes_occurrences[KEY].update(found_stuff)

        characterised_attributes_occurrences = {}
        for KEY in CONFUSION_MATRIX:
            characterised_attributes_occurrences[KEY] = {}
            for attribute in attributes:
                if attribute not in characterised_attributes_occurrences[KEY]:
                    characterised_attributes_occurrences[KEY][
                        attribute] = dict()
                for attr in attributes[attribute]:
                    characterised_attributes_occurrences[KEY][attribute][
                        tassellate_numbers(attr)] = 0
        for KEY in CONFUSION_MATRIX:
            for occ in attributes_occurrences[KEY]:
                for attr in characterised_attributes_occurrences[KEY]:
                    if occ in characterised_attributes_occurrences[KEY][attr]:
                        characterised_attributes_occurrences[KEY][attr][
                            occ] = attributes_occurrences[KEY][occ]
            for attr in characterised_attributes_occurrences[KEY]:
                characterised_attributes_occurrences[KEY][attr]['Total'] = sum(
                    [
                        characterised_attributes_occurrences[KEY][attr]
                        [element] for element in
                        characterised_attributes_occurrences[KEY][attr]
                    ])
        print('Absence computed.')
        print('The absence AFTER filtering is:\n',
              characterised_attributes_occurrences)

        print(
            'RE-computing the sequence pattern result after applying the thresholds...'
        )

        static_attr = [
            #    'Age',
            #    'ClaimValue',
            #    'CType',
            #    'ClType',
            #    'PClaims',
        ]
        limefeats_static_dinamic = {}
        for KEY in CONFUSION_MATRIX:
            limefeats_static_dinamic[KEY] = {}
            for tid in filtered_limefeats[KEY]:
                limefeats_static_dinamic[KEY][tid] = {
                    'static': [],
                    'dynamic': [
                        att for att in filtered_limefeats[KEY][tid]
                        if not any([
                            att[0].startswith(static_att)
                            for static_att in static_attr
                        ])
                    ]
                }
                current_static_attributes = [
                    att for att in filtered_limefeats[KEY][tid] if any([
                        att[0].startswith(static_att)
                        for static_att in static_attr
                    ])
                ]
                for s_attr in static_attr:
                    curr_attributes = [
                        att for att in current_static_attributes
                        if att[0].startswith(s_attr)
                    ]
                    if len(curr_attributes) > 0:
                        if KEY in ['tp', 'fp']:
                            limefeats_static_dinamic[KEY][tid]['static'] += [
                                max(curr_attributes, key=lambda x: x[2])
                            ]
                        elif KEY in ['tn', 'fn']:
                            limefeats_static_dinamic[KEY][tid]['static'] += [
                                max(curr_attributes, key=lambda x: x[2])
                            ]
                        else:
                            print('Something bad happened')

        dynamic_data = {
            KEY: {
                tid: [
                    # (element[0].split('_')[0] + '_' +  element[1])
                    (element[0] + '_' + element[1]) for element in sorted(
                        [
                            k for k in limefeats_static_dinamic[KEY][tid]
                            ['dynamic']
                        ],
                        # key=lambda x: (x[0].split('_')[1], x[0].split('_')[0])
                        key=lambda x: x[0])
                ]
                for tid in limefeats_static_dinamic[KEY]
                if len(limefeats_static_dinamic[KEY][tid]['dynamic']) > 0
            }
            for KEY in CONFUSION_MATRIX
        }

        static_data = {
            KEY: {
                tid: [
                    (element[0].split('_')[0] + '_' +
                     tassellate_numbers(element[1]))
                    # (element[0] + '_' + tassellate_numbers(element[1]))
                    for element in sorted([
                        k for k in limefeats_static_dinamic[KEY][tid]['static']
                    ],
                                          key=lambda x: (x[0].split('_')[1], x[
                                              0].split('_')[0]))
                ]
                for tid in limefeats_static_dinamic[KEY]
                if len(limefeats_static_dinamic[KEY][tid]['static']) > 0
            }
            for KEY in CONFUSION_MATRIX
        }

        data = {}
        for KEY in CONFUSION_MATRIX:
            data[KEY] = {}
            for tid in limefeats[KEY]:
                if tid in static_data[KEY] and tid in dynamic_data[KEY]:
                    data[KEY][
                        tid] = static_data[KEY][tid] + dynamic_data[KEY][tid]
                elif tid in static_data[KEY]:
                    data[KEY][tid] = static_data[KEY][tid]
                elif tid in dynamic_data[KEY]:
                    data[KEY][tid] = dynamic_data[KEY][tid]

        if (MINING_METHOD == 'seq_mining'):
            freq_seqs_after_filter = {
                'tp':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['tp'][tid] for tid in data['tp']], 2)),
                'tn':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['tn'][tid] for tid in data['tn']], 2)),
                'fp':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['fp'][tid] for tid in data['fp']], 2)),
                'fn':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['fn'][tid] for tid in data['fn']], 2)),
            }
        if (MINING_METHOD == 'item_mining'):
            freq_seqs_after_filter = {
                'tp':
                itemmining.relim(itemmining.get_relim_input(
                    [data['tp'][tid] for tid in data['tp']]),
                                 min_support=2),
                'tn':
                itemmining.relim(itemmining.get_relim_input(
                    [data['tn'][tid] for tid in data['tn']]),
                                 min_support=2),
                'fp':
                itemmining.relim(itemmining.get_relim_input(
                    [data['fp'][tid] for tid in data['fp']]),
                                 min_support=2),
                'fn':
                itemmining.relim(itemmining.get_relim_input(
                    [data['fn'][tid] for tid in data['fn']]),
                                 min_support=2),
            }

            freq_seqs_after_filter = {
                KEY: [(tuple(element), freq_seqs_after_filter[KEY][element])
                      for element in freq_seqs_after_filter[KEY]]
                for KEY in CONFUSION_MATRIX
            }

        filtered_freq_seqs_after_filter_old = {
            KEY: sorted([[
                element[0],
                weight_freq_seqs(KEY, available_values, element, limefeats)
            ] for element in freq_seqs_after_filter[KEY] if weight_freq_seqs(
                KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        # todo: filter topK
        filtered_freq_seqs_after_filter = {
            KEY: filtered_freq_seqs_after_filter_old[KEY][0:FREQ_SEQS['top']]
            for KEY in CONFUSION_MATRIX
        }

        print('Sequence pattern recomputed successfully.')

        if (FREQ_SEQS['outputfile'] is not None):
            print('Start saving results..')
            printout_freq_seqs(filtered_freq_seqs_after_filter,
                               FREQ_SEQS['RECOMPUTEDoutputfile'],
                               maxlinelength=200)
            print('Results saved.')
        else:
            print('RECOMPUTED_FREQ_SEQS:\n', filtered_freq_seqs_after_filter)

        print('Done, cheers!')
        return confusion_matrix, data, freq_seqs_after_filter, filtered_freq_seqs_after_filter
Beispiel #19
0
    def seqs_mining(self):
        """Mine frequent sequences from the stored transactions.

        Runs pymining's sequence-pattern miner over ``self.transactions``
        using the configured minimum support ``self.min_sup`` and returns
        the resulting (sequence, support) pairs in sorted order.
        """
        mined = seqmining.freq_seq_enum(self.transactions, self.min_sup)
        return sorted(mined)
Beispiel #20
0
 def __artist_freq_seq(self, artist_patterns, min_size):
     """Mine frequent artist sequences and record them on the profile.

     Feeds *artist_patterns* to pymining's sequence miner with minimum
     support *min_size*, post-processes the raw result through
     ``__seq_pattern``, and stores it under the ``'artist_pattern'`` key
     of ``self.profile``.
     """
     mined = seqmining.freq_seq_enum(artist_patterns, min_size)
     self.profile['artist_pattern'] = self.__seq_pattern(mined)
Beispiel #21
0
import pandas as pd
# See: https://github.com/bartdag/pymining
from pymining import itemmining, assocrules, seqmining

# Load the raw enrollment events and group each user's courses into a list.
enrollment = pd.read_csv('course-enrollment.csv')
# NOTE: the old dict-of-renamers form `.agg({'course_id': lambda ...})` was
# removed in pandas 1.0 (SpecificationError: nested renamer); aggregating the
# SeriesGroupBy directly produces the same 'course_id' column after
# reset_index(). Keep only the last 40,000 users to bound the mining cost.
grouped = (enrollment.groupby('user_id')['course_id']
           .agg(lambda x: x.tolist())
           .reset_index()[-40000:])

# One transaction (list of course ids) per user.
events = grouped.course_id.values.tolist()

# Frequent itemsets via RELIM, then association rules over them.
relim_input = itemmining.get_relim_input(events)
report = itemmining.relim(relim_input, min_support=2)

print('Associative rules:')
rules = assocrules.mine_assoc_rules(report, min_support=5, min_confidence=0.6)
rules_df = pd.DataFrame.from_records(
    rules, columns=['from', 'to', 'support', 'confidence'])
print(rules_df.sort_values('support', ascending=False).head(10))

# Frequent (ordered) sequences with minimum support 5; show the longest,
# best-supported multi-course sequences.
print('Frequent sequences: ')
freq_seqs = seqmining.freq_seq_enum(events, 5)
freq_seqs_df = pd.DataFrame.from_records(list(freq_seqs),
                                         columns=['sequence', 'support'])
freq_seqs_df['sequence_len'] = freq_seqs_df.sequence.apply(len)
print(freq_seqs_df[freq_seqs_df.sequence_len > 1].sort_values(
    ['sequence_len', 'support'], ascending=False).head(10))