def mine_seqs(self, baskets):
    """Mine frequent sequences from `baskets` and keep only high-lift, maximal ones.

    :param baskets: iterable of sequences fed to pymining's freq_seq_enum;
                    minimum support is ``len(baskets) * self.min_support``.
    :return: list of (sequence, support, lift) tuples surviving the lift
             threshold ``self.min_lift`` and non-max suppression.

    FIX: the original used Python-2 ``print`` statements, which are syntax
    errors under Python 3. Single-argument ``print(...)`` calls below are
    valid in both Python 2 and 3, so behavior is unchanged.
    """
    print("mining frequent sequences")
    freq_seqs = seqmining.freq_seq_enum(baskets, len(baskets) * self.min_support)
    print("found {} frequent sequences".format(len(freq_seqs)))
    total = len(baskets)
    # Support per pattern, keyed by the *unordered* item set.
    # NOTE(review): two frequent sequences with the same items in different
    # orders collide on the same frozenset key (last one wins) — confirm this
    # is acceptable for the lift computation.
    seq_supports = {frozenset(x[0]): float(x[1]) / total for x in freq_seqs}
    out_seqs = []
    # test the lift of each rule: compare the pattern's support against the
    # best product of supports over all single-token / rest splits
    for seq in freq_seqs:
        seq_key = frozenset(seq[0])
        if len(seq_key) < 2:
            continue  # lift is undefined for singletons
        sup_total = seq_supports[seq_key]
        sup_split_max = 0
        for token in seq_key:
            token_set = frozenset((token,))
            sup_token = seq_supports[token_set]
            # assumes every subset key is present (Apriori property) —
            # TODO confirm for the unordered-key representation
            sup_rest = seq_supports[seq_key - token_set]
            if sup_token * sup_rest > sup_split_max:
                sup_split_max = sup_token * sup_rest
        if (sup_total / sup_split_max) > self.min_lift:
            out_seqs.append((seq[0], sup_total, sup_total / sup_split_max))
    freq_seqs = out_seqs
    print("found {} sequences with sufficient lift".format(len(freq_seqs)))
    # keep only maximal sequences (drop sub-sequences of kept ones)
    freq_seqs = self.nonmax_suppression_seqs(freq_seqs)
    print("found {} maximal sequences".format(len(freq_seqs)))
    return freq_seqs
def run_sequence_mining(students, min_support, filter_type):
    """Build one course sequence per student and mine frequent sequences.

    :param students: objects with a ``course_seq_dict`` mapping semester index
                     -> list of course objects (each with ``.type``/``.name``)
    :param min_support: absolute minimum support passed to freq_seq_enum
    :param filter_type: 'generic_ge' keeps only GE courses (collapsed to "GE"),
                        'cs_only' keeps core/bonus courses by name; any other
                        value keeps everything, prefixed by semester index
    :return: list of [support, [pattern]] pairs
    """
    sequences = []
    for student in students:
        course_list = []
        # iterate semesters in chronological (sorted) order
        semester_keys = list(student.course_seq_dict.keys())
        semester_keys.sort()
        for seq_int in semester_keys:
            student_sem_hist = student.course_seq_dict[seq_int]
            temp_list = []
            for x in student_sem_hist:
                if filter_type == 'generic_ge':
                    if x.type == "ge":
                        temp_list.append("GE")
                elif filter_type == "cs_only":
                    if x.type == "core" or x.type == "bonus":
                        temp_list.append(x.name)
                # NOTE(review): after flattening, it is ambiguous whether this
                # `else` belonged to the inner `if x.type == ...` or to the
                # filter_type chain; the chain-level reading (default filter)
                # is assumed here — confirm against the original file.
                else:
                    temp_list.append(str(seq_int)+"_"+x.name)
            # courses within a semester are order-insensitive: sort them
            temp_list.sort()
            course_list.extend(temp_list)
        sequences.append(course_list)
    print("init run")
    datas = seqmining.freq_seq_enum(sequences, min_support)
    output_data = []
    # reshape (pattern, support) -> [support, [pattern]]
    for data in datas:
        output_data.append([data[1], [data[0]]])
    return output_data
def fit(self, train_data=None):
    """
    Fit the model
    :param train_data: (optional) DataFrame with the training sequences, which
        must be assigned to column "sequence". If None, run FSM using SPFM over
        the sequence database stored in `self.db_path`.
        Otherwise, run FSM using `pymining.seqmining` (slower).
    :raises ValueError: if paths are unset when called without arguments, or if
        `self.minsup` is outside [0, 1] on the SPMF path.
    """
    if train_data is None:
        if self.spmf_path is None or self.db_path is None:
            raise ValueError(
                "You should set db_path and spfm_path before calling fit() without arguments."
            )
        self.logger.info('Using SPFM (Java) for Frequent Sequence Mining')
        if 0 <= self.minsup <= 1:
            percentage_min_sup = self.minsup * 100
        else:
            # FIX: was `raise NameError(...)` — wrong exception type for a
            # value-range violation, and inconsistent with the ValueError
            # raised above for missing paths.
            raise ValueError("SPMF only accepts 0<=minsup<=1")
        # call spmf with "<algorithm> <input> <output> <minsup>%"
        command = ' '.join([
            self.spmf_algorithm, self.db_path, self.output_path,
            str(percentage_min_sup) + '%'
        ])
        callSPMF(self.spmf_path, command)
        # parse back output from text file
        self._parse_spfm_output()
    else:
        # use pymining
        self.logger.info(
            'Using pymining.seqmining (python) for Frequent Sequence Mining'
        )
        sequences = train_data['sequence'].values
        # fractional minsup is interpreted relative to the number of sequences
        msup = int(
            self.minsup * len(sequences)) if 0 <= self.minsup <= 1 else self.minsup
        self.logger.info(
            'Mining frequent sequences (minsup={})'.format(msup))
        self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)
        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
    # Build a prefix tree over all mined frequent sequences.
    self.logger.info('Building the prefix tree')
    self.tree = SmartTree()
    self.root_node = self.tree.set_root()
    for pattern, support in self.freq_seqs:
        if len(pattern) == 1:
            # add node to root
            self.tree.create_node(pattern[0],
                                  parent=self.root_node,
                                  data={"support": support})
        elif len(pattern) > 1:
            # add entire path starting from root
            self.tree.add_path(self.root_node, pattern, support)
        else:
            raise ValueError('Frequent sequence of length 0')
    self.logger.info('Training completed')
def run_sequnce_testing():
    """Smoke-test frequent-sequence mining on a fixed set of course strings.

    Each entry is a single comma-joined string, so freq_seq_enum treats it as
    a sequence of *characters* — kept exactly as the original behaved.
    """
    sequences = (
        "CSC100,CSC200,CSC300,MATH100,MATH200",
        "CSC100,MATH100,MATH200,CSC200,CSC300",
        "CSC100,MATH200,CSC300,CSC200,MATH100",
        "CSC200,MATH100,CSC100,CSC300,MATH200",
        "MATH100,MATH200,CSC100,CSC300,CSC200",
    )
    mined = seqmining.freq_seq_enum(sequences, 4)
    for entry in mined:
        print(entry)
def fun3():
    """Interactive demo: mine frequent sequences from a small hard-coded corpus."""
    seqs = ('caabc', 'abcb', 'cabc', 'abbca')
    mined = seqmining.freq_seq_enum(seqs, 2)
    print("The default sequence data is:")
    print(seqs)
    time.sleep(1)
    input("Press any key to see the discovered frequent seqences")
    print(sorted(mined))
    # Explain how to read one result tuple.
    print(
        "\nNote:(('a', 'b'), 4) means:\nIn the given seqs tuple, there are 4 times that 'b' appears after 'a'"
    )
    input("Press Any button to return to CONTENT")
def fit(self, seqs):
    """Takes a list of list of sequences .

    Mines frequent sequences either via SPMF (when `self.spmf_path` and
    `self.db_path` are set) or via `pymining.seqmining` over `seqs`, then
    builds a prefix tree (`self.tree`) of the mined patterns.

    FIXES vs. original:
    - when neither a database path nor `seqs` is provided, the method now
      returns after logging instead of falling through and crashing on
      `self.freq_seqs`;
    - the loop variable no longer shadows the builtin `tuple`;
    - typo in the error log message ("dabase" -> "database").
    """
    if self.spmf_path and self.db_path:
        self.logger.info("Using SPMF")
        # parse minsup
        if 0 <= self.minsup <= 1:
            percentage_min_sup = self.minsup * 100
        else:
            # NOTE(review): NameError is an odd exception type here, but it is
            # kept so existing callers that catch it keep working.
            raise NameError("SPMF only accepts 0<=minsup<=1")
        # call spmf
        algorithm = "PrefixSpan"
        command = ' '.join([
            algorithm, self.db_path, self.outputPath,
            str(percentage_min_sup) + '%'
        ])
        callSPMF(self.spmf_path, command)
        # parse back output from text file
        self._parse_SPMF_output()
    elif seqs:
        # fractional minsup is relative to the number of sequences
        # NOTE(review): msup may be a float here; confirm freq_seq_enum
        # accepts non-integer support thresholds.
        msup = self.minsup * len(
            seqs) if 0 <= self.minsup <= 1 else self.minsup
        self.logger.debug('Mining frequent sequences')
        self.freq_seqs = seqmining.freq_seq_enum(seqs, msup)
    else:
        self.logger.error(
            "No sequence database path nor sequence list provided.")
        # FIX: bail out instead of continuing with undefined/stale freq_seqs
        return
    self.logger.info('{} frequent sequences found'.format(
        len(self.freq_seqs)))
    self.logger.debug('Building frequent sequence tree')
    self.tree = SmartTree()
    self.rootNode = self.tree.set_root()
    for freq_seq in self.freq_seqs:  # (pattern, support) pairs
        if len(freq_seq[0]) == 1:
            # add node to root
            self.tree.create_node(freq_seq[0][0],
                                  parent=self.rootNode,
                                  data={"support": freq_seq[1]})
        elif len(freq_seq[0]) > 1:
            # add entire path starting from root
            self.tree.add_path(self.rootNode, freq_seq[0], freq_seq[1])
        else:
            raise NameError('Frequent sequence of length 0')
    self.logger.debug('Tree completed')
def frequentSequences(gs, samples=None, minsup=None, window_len=3, days=1, granularity=None):
    """ Returns frequent sequences mined using pymining
        gs          : gSpan object
        minsup      : minimum support to decide for frequency of a sequence
                      ([1,2,1,3], [5,1,1,5]) with minsup=2 will return [1,1]
        window_len  : specifies how many sequences are in a window
        days        : used for getting sequences, check getSequences for detail
        granularity : is the "speed"(or step) of the window
        Example     : window_len=7 days=1
                      The sequences will be daily subgraphs and window of 7
                      will act like a week
    """
    seqs = getSequences(gs, samples, days)
    # "Defaults" to window
    if not granularity:
        granularity = window_len
    if not minsup:
        minsup = window_len
    result = OrderedDict()
    window_id = 0
    lo, hi = 0, window_len
    # Slide a fixed-size window over the sequence list.
    # NOTE(review): the final window ending exactly at len(seqs) is skipped
    # by the strict `<` — kept as-is to preserve original behavior; confirm
    # whether `<=` was intended.
    while hi < len(seqs):
        mined = seqmining.freq_seq_enum(seqs[lo:hi], minsup)
        # chr to int conversion: patterns come back as characters
        result[window_id] = [
            (tuple(ord(symbol) for symbol in pattern), count)
            for pattern, count in mined
        ]
        lo += granularity
        hi += granularity
        window_id += 1
    return result
def mine_patterns(data, MINING_METHOD, CONFUSION_MATRIX):
    """Mine patterns per confusion-matrix cell.

    :param data: dict KEY -> {trace_id -> sequence of items}
    :param MINING_METHOD: 'seq_mining' (pymining seqmining) or 'item_mining'
                          (pymining relim itemset mining)
    :param CONFUSION_MATRIX: iterable of cell keys (e.g. tp/tn/fp/fn)
    :return: dict KEY -> list/sorted collection of (pattern, support)
    :raises ValueError: for an unknown MINING_METHOD (FIX: the original left
        `mined_patterns` unbound and crashed with UnboundLocalError).
    """
    if MINING_METHOD == 'seq_mining':
        mined_patterns = {
            KEY: sorted(
                seqmining.freq_seq_enum(
                    [data[KEY][trace_id] for trace_id in data[KEY]],
                    min_support=2))
            for KEY in CONFUSION_MATRIX
        }
    elif MINING_METHOD == 'item_mining':
        mined_patterns_to_be_preprocessed = {
            KEY: itemmining.relim(
                itemmining.get_relim_input(
                    [data[KEY][trace_id] for trace_id in data[KEY]]),
                min_support=2)
            for KEY in CONFUSION_MATRIX
        }
        # relim returns {frozenset: support}; normalize to (tuple, support)
        mined_patterns = {
            KEY: [(tuple(element),
                   mined_patterns_to_be_preprocessed[KEY][element])
                  for element in mined_patterns_to_be_preprocessed[KEY]]
            for KEY in CONFUSION_MATRIX
        }
    else:
        raise ValueError(
            "Unknown MINING_METHOD: {}".format(MINING_METHOD))
    return mined_patterns
def strict_period():
    """Build one aversion string per user from the CSV at FILE, then mine
    frequent sequences over all users' strings.

    Each user accumulates elapsed time (column 3, ms); every full PERIOD
    seconds emits one symbol: 'a' if any row in that period had column 6 == "0"
    (aversion), else 'c'.

    FIXES vs. original:
    - sequences were stored under the *next* user's key (flush happened after
      switching `curr_usr`); now flushed under the owning user's key;
    - the last user's sequence was silently dropped; now flushed after the loop;
    - rows with an empty user id no longer create a bogus dict entry;
    - accumulated time/aversion state no longer bleeds from one user into the
      next;
    - local no longer shadows the builtin `dict`.
    """
    user_seqs = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)  # skip header row
        curr_usr = None
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if row[2] == "":
                continue
            if curr_usr != row[2]:
                # flush the previous user's sequence under their own id
                if curr_usr is not None:
                    user_seqs[curr_usr] = "".join(temp)
                curr_usr = row[2]
                temp = []
                curr_time = 0
                aversion = "c"
            curr_time += int(row[3])
            if row[6] == "0":
                aversion = "a"
            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"
        # flush the final user (previously lost)
        if curr_usr is not None:
            user_seqs[curr_usr] = "".join(temp)
    print(user_seqs.values())
    print("Strict period \n\n")
    freq_seqs = seqmining.freq_seq_enum(list(user_seqs.values()), 8)
    for fs in sorted(freq_seqs):
        print(fs)
    print("\n")
    print("\n")
    print("\n\n\n")
from pymining import itemmining
from pymining import seqmining
import sys

# Usage: python freq_seq.py <data file> <minimum support>
if (len(sys.argv) != 3):
    print(
        "Please provide the data file and the minimum support as input, e.g., python freq_seq.py ./output.txt 40"
    )
    sys.exit(-1)
# FIX: the file handle was opened and never closed; use a context manager.
with open(sys.argv[1], 'r') as f:
    lines = f.read().splitlines()
seqs = []
for s in lines:
    # each line looks like "<prefix>---[item1, item2, ...]"
    seq = s.split("---")[1]
    seq = seq[1:-1]  # strip surrounding brackets
    seqs.append(seq.split(", "))
freq_seqs = seqmining.freq_seq_enum(seqs, int(sys.argv[2]))
for p in freq_seqs:
    print(p)
# NOTE(review): this is a Jupyter-notebook export, not a runnable .py file —
# it contains the `%matplotlib inline` IPython magic and Python-2 `print`
# statements at the end. Left untouched; only comments added.
import pandas as pd
import numpy as np
from pymining import seqmining, itemmining, assocrules, perftesting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
# Load inspection data and keep date + violation code only.
studydf = pd.read_csv("studydf.csv")
violationdf = studydf[['INSPECTION DATE','VIOLATION CODE']].reset_index()
violationdf['VIOLATION CODE'] = violationdf['VIOLATION CODE'].astype('str')
# Bar plot of the 20 most frequent violation codes.
plotseries = violationdf['VIOLATION CODE'].value_counts().iloc[0:20]
ax = sns.barplot(y=plotseries.index, x=plotseries.values, palette="Blues_d")
# Group violation codes per establishment.
# NOTE(review): 'CAMIS' is not among the two columns selected above — this
# presumably relies on it surviving via the index/reset_index in the notebook
# session; verify against the original notebook.
testdf = violationdf.groupby(['CAMIS'])['VIOLATION CODE'].apply(list)
minelist = testdf.tolist()[0:10]
minelist = tuple(tuple(x) for x in minelist)
# Frequent itemsets + association rules over the first 10 establishments.
relim_input = itemmining.get_relim_input(minelist)
item_sets = itemmining.relim(relim_input, min_support=2)
rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
print rules
# Frequent sequences over the same transactions.
freq_seqs = seqmining.freq_seq_enum(minelist, 2)
print freq_seqs
# Same rules with a looser support threshold.
rules2 = assocrules.mine_assoc_rules(item_sets, min_support=1, min_confidence=0.5)
print rules2
# assocpairresults.write(str(rules)) # rulepitdurlens.append(len(rules)) # allpat = seqmining.freq_seq_enum(oneFamily, sup) # allpat = seqmining.freq_seq_enum(pitdurfam, sup) # with open("Freq_{}.txt".format(sup),"w") as freqresults: # freqresults.write("this many:" + str(len(allpat))) # freqresults.write(str(allpat)) # patpitlens.append(len(allpat)) chrinputlist = [] for pitlist in inputlist: chrinputlist.append("".join([chr(x) for x in pitlist])) # print(chrallpit) allpat = seqmining.freq_seq_enum(chrinputlist, sup) # print("length allpat:"+str(len(allpat))) # print(allpat) patpitlens.append(len(allpat)) # allpat = seqmining.freq_seq_enum(durfam, sup) # with open("Freqpit_{}.txt".format(sup),"w") as freqresults: # freqresults.write("this many:" + str(len(allpat))) # freqresults.write(str(", ".join([[ord(x) for x in strings] for strings in allpat]))) # patpitdurlens.append(len(allpat)) import numpy as np print(len(patpitlens)) # print(len(patpitlens)/26) print(len(allpit)) itemmatrix = np.array(itempitlens).reshape(len(allpit)-1,29)
# Fragment of a larger experiment script: `allpit`, `sup` and `patdurlens`
# are defined earlier in the original file (not visible here).
# assocpairresults.write(str(rules))
# rulepitdurlens.append(len(rules))
# allpat = seqmining.freq_seq_enum(oneFamily, sup)
# allpat = seqmining.freq_seq_enum(pitdurfam, sup)
# with open("Freq_{}.txt".format(sup),"w") as freqresults:
#     freqresults.write("this many:" + str(len(allpat)))
#     freqresults.write(str(allpat))
# patpitlens.append(len(allpat))
# Encode each pitch list as a string of chr codes for sequence mining.
chrallpit = []
for pitlist in allpit:
    chrallpit.append("".join([chr(x) for x in pitlist]))
# print(chrallpit)
allpat = seqmining.freq_seq_enum(chrallpit, sup)
print("length allpat:" + str(len(allpat)))
# print(allpat)
patdurlens.append(len(allpat))
# allpat = seqmining.freq_seq_enum(durfam, sup)
with open("Freqpit_{}.txt".format(sup), "w") as freqresults:
    freqresults.write("this many:" + str(len(allpat)))
    # FIX: the original
    #   ", ".join([[ord(x) for x in strings] for strings in allpat])
    # raised TypeError: str.join cannot join lists, and `strings` is actually
    # a (pattern, support) tuple. Write each frequent pattern decoded back to
    # its integer pitch codes instead.
    freqresults.write(", ".join(
        str([ord(ch) for ch in pattern]) for pattern, _count in allpat))
# patpitdurlens.append(len(allpat))
import matplotlib.pyplot as plt
plt.switch_backend('agg')
plt.figure()
# plt.plot(rulepitdurlens)
def get_sequences(seqs, size):
    """Return the frequent sequences of `seqs` at minimum support `size`."""
    return seqmining.freq_seq_enum(seqs, size)
if output[i]==2: str1=str1+'G' #Ground if output[i]==3: str1=str1+'p' #player if output[i]==4: str1=str1+'b' #boundary if output[i]==5: str1=str1+'C' #crowd if output[i]==6: str1=str1+'S' #sky s.append(str1) #print(str1) #print(len(s)) print (s) freq_seqs = seqmining.freq_seq_enum(s, 2) print(sorted(freq_seqs)) a=list(freq_seqs) a1=[] print(a) for i in range(len(freq_seqs)): s1="" for j in range(len(a[i][0])): s1 = s1+(a[i][0][j]) a1.append(s1) print(a1) file = open("t.txt","w") for i in range(len(a1)): file.write(a1[i]+"\n") file.close() '''ans={0:"pitch",1:"batsmen",2:"ground",3:"player",
def Apriori_four(data_tuple):
    """Print the frequent sequences of a fixed demo corpus.

    NOTE(review): `data_tuple` is accepted but never used — the demo always
    mines the hard-coded tuple below; kept as-is to preserve behavior.
    """
    demo_seqs = ('caabc', 'abcb', 'cabc', 'abbca')
    mined = seqmining.freq_seq_enum(demo_seqs, 2)
    print(sorted(mined))
def get_seqence(corpus):
    """Mine `corpus` with minimum support 1 and return the patterns sorted."""
    mined = seqmining.freq_seq_enum(corpus, 1)
    return sorted(mined)
def handle(self, *args, **kwargs):
    """Analyse one predictive-model job: build a confusion matrix from
    temporal-stability predictions, rank LIME feature importances per trace,
    mine frequent patterns per confusion-matrix cell, and re-mine after
    threshold filtering.

    Presumably a Django management-command entry point (Job/Split/Explanation
    use the `.objects` ORM manager) — TODO confirm.

    Returns (confusion_matrix, data, freq_seqs_after_filter,
             filtered_freq_seqs_after_filter).
    """
    # Hard-coded experiment identifiers.
    TARGET_JOB = 71
    SPLITID = 12
    job_obj = Job.objects.filter(pk=TARGET_JOB)[0]
    split_obj = Split.objects.filter(pk=SPLITID)[0]
    training_df, test_df = get_encoded_logs(job_obj)
    # Independent copies: each downstream computation may mutate its frame.
    test_df1 = test_df.copy()
    test_df2 = test_df.copy()
    test_df3 = test_df.copy()
    # todo: retrieve lime explanation
    # RETRIEVE&SAVE TS
    ts_exp_job, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.TEMPORAL_STABILITY.value,
        split=split_obj,
        predictive_model=job_obj.predictive_model,
        job=job_obj)
    ts = temporal_stability(ts_exp_job,
                            training_df,
                            test_df1,
                            explanation_target=None)
    # RETRIEVE&SAVE LIMETS
    limets_exp_job, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=split_obj,
        predictive_model=job_obj.predictive_model,
        job=job_obj)
    lime_ts = lime_temporal_stability(limets_exp_job,
                                      training_df,
                                      test_df2,
                                      explanation_target=None)
    # SAVE GOLD
    gold = test_df3[['trace_id', 'label']]
    # todo: retrieve confusion matrix
    # Normalize keys ending in '_' by appending '1' (both dicts).
    ts = {
        asdf: {
            uuu + '1' if uuu[-1:] == '_' else uuu: ts[asdf][uuu]
            for uuu in ts[asdf]
        }
        for asdf in ts
    }
    lime_ts = {
        asdf: {
            uuu + '1' if uuu[-1:] == '_' else uuu: lime_ts[asdf][uuu]
            for uuu in lime_ts[asdf]
        }
        for asdf in lime_ts
    }
    trace_ids = set(gold['trace_id'])
    # Classify each trace by comparing the last-prefix prediction with the
    # gold label ('true'/'false' strings).
    confusion_matrix = {
        'tp': [
            str(tid) for tid in trace_ids
            if str(tid) in ts
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'true'
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] ==
            ('true' if gold[gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'tn': [
            str(tid) for tid in trace_ids
            if str(tid) in ts
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'false'
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] ==
            ('true' if gold[gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'fp': [
            str(tid) for tid in trace_ids
            if str(tid) in ts
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'true'
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] !=
            ('true' if gold[gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'fn': [
            str(tid) for tid in trace_ids
            if str(tid) in ts
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'false'
            and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted'] !=
            ('true' if gold[gold['trace_id'] == tid]['label'].values[0] else 'false')
        ]
    }
    # Per cell and per trace: (feature, value, importance) triples sorted by
    # importance then value.
    limefeats = {
        k: {
            key: [
                element for element in sorted(
                    [(pref,
                      lime_ts[key]['prefix_' + str(job_obj.encoding.prefix_length)][pref]['value'],
                      lime_ts[key]['prefix_' + str(job_obj.encoding.prefix_length)][pref]['importance'])
                     for pref in lime_ts[key]['prefix_' + str(job_obj.encoding.prefix_length)]],
                    key=lambda x: (x[2], x[1]),
                    reverse=True if k in ['tp', 'fp'] else False
                    # reverse order of lime values if the prediction is negative
                )
            ]
            for key in confusion_matrix[k]
            if 'prefix_' + str(job_obj.encoding.prefix_length) in lime_ts[key]
        }
        for k in confusion_matrix
    }
    freq_seqs = {'tp': {}, 'tn': {}, 'fp': {}, 'fn': {}}
    # todo: retrive patterns
    CONFUSION_MATRIX = ['tp', 'tn', 'fp', 'fn']
    # Threshold configuration; output files are disabled (None).
    LIMEFEATS = {
        'abs_lime': False,
        'tp': 0.2,
        'tn': 0.2,
        'fp': 0.2,
        'fn': 0.2,
        'top': 10,
        'outputfile': None
    }
    FREQ_SEQS = {
        'tp': 10,
        'tn': 10,
        'fp': 10,
        'fn': 10,
        'top': 15,
        'outputfile': None,
        'RECOMPUTEDoutputfile': None,
    }
    # NOTE(review): ABSENCE is configured but never read below.
    ABSENCE = {
        'tp': 0.1,
        'tn': 0.1,
        'fp': 0.1,
        'fn': 0.1,
        'ABSENCEoutputfile': None
    }
    MINING_METHOD = 'item_mining'
    print(
        'Initial CONFUSION MATRIX:\n', *[
            '\tlimefeats ' + KEY + ':' + str(len(limefeats[KEY]))
            for KEY in CONFUSION_MATRIX
        ], '\n', *[
            '\tfreq_seqs ' + KEY + ':' + str(len(freq_seqs[KEY]))
            for KEY in CONFUSION_MATRIX
        ])
    # attribute prefix -> set of observed values, per cell
    available_values = {}
    for KEY in CONFUSION_MATRIX:
        available_values[KEY] = {}
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                if event[0].split('_')[0] not in available_values[KEY]:
                    available_values[KEY][event[0].split('_')[0]] = set()
                available_values[KEY][event[0].split('_')[0]].add(event[1])
    # Keep only events whose LIME importance passes the per-cell threshold
    # (signed for tp/fp vs tn/fn, absolute when 'abs_lime' is set).
    filtered_limefeats = {
        KEY: {
            tid: [
                event for event in limefeats[KEY][tid]
                if ((not LIMEFEATS['abs_lime']) and (
                    (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                    (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY]))) or
                (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
            ]
            for tid in limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }
    # Same filter again (kept separate so the top-K slice below does not
    # affect filtered_limefeats).
    prefiltered_limefeats = {
        KEY: {
            tid: [
                event for event in limefeats[KEY][tid]
                if ((not LIMEFEATS['abs_lime']) and (
                    (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                    (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY]))) or
                (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
            ]
            for tid in limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }
    # NOTE(review): filtered_limefeats_mine is computed but never used below.
    filtered_limefeats_mine = {
        KEY: {
            tid: prefiltered_limefeats[KEY][tid][0:LIMEFEATS['top']]
            for tid in prefiltered_limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }
    # Drop traces whose event list became empty after filtering.
    for KEY in CONFUSION_MATRIX:
        for k in list(filtered_limefeats[KEY]):
            if len(filtered_limefeats[KEY][k]) == 0:
                del filtered_limefeats[KEY][k]

    def tassellate_numbers(element):
        # Bucket numeric-looking strings: short decimals collapse to "X0",
        # very long decimals keep their first 4 integer digits; everything
        # else passes through unchanged.
        element = str(element)
        return str(element).split('.')[0][0] + '0' if '.' in str(element) and len(str(element)) <= 5 else str(element).split('.')[0][0:4] if '.' in str(element) and len(str(element)) >= 10 else element

    def retrieve_right_len(element, available_values):
        # Cardinality of the attribute domain the element belongs to; for a
        # bare (tassellated) value, take the max over matching attributes.
        if '_' in element:
            return len(available_values[element.split('_')[0]])
        else:
            retval = []
            for attribute in available_values:
                if any([
                        str(element) == str(tassellate_numbers(value))
                        for value in available_values[attribute]
                ]):
                    retval += [len(available_values[attribute])]
            return max(retval)

    def weight_freq_seqs(KEY, available_values, element, limefeats):
        # Score a mined (pattern, frequency) pair; the multiplicative terms
        # are printed for debugging but intentionally commented out of the
        # returned value.
        print(element[0])
        print(
            'frequency:', element[1], ' * ',
            'len w/out absences: ',
            len([el for el in element[0] if 'absence' not in el]), ' * ',
            'sum of enumerator of possible values: ',
            sum([
                retrieve_right_len(el, available_values[KEY])
                for el in element[0] if 'absence' not in el
            ]), ' / ',
            'amount of examples in the field of confusion matrix: ',
            len(limefeats[KEY]), ' = ',
            (element[1] *
             len([el for el in element[0] if 'absence' not in el]) *
             sum([
                 retrieve_right_len(el, available_values[KEY])
                 for el in element[0] if 'absence' not in el
             ])) / len(limefeats[KEY]))
        return (
            element[1]
            # *
            # len([el for el in element[0] if 'absence' not in el]) *
            # sum([retrieve_right_len(el, available_values[KEY]) for el in element[0] if 'absence' not in el])
        ) / len(limefeats[KEY])

    # freq_seqs is empty at this point, so both of these are empty too.
    filtered_freq_seqs_old = {
        KEY: sorted([
            element for element in freq_seqs[KEY]
            if weight_freq_seqs(KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]
        ],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }
    prefiltered_freq_seqs = {
        KEY: sorted([
            element for element in freq_seqs[KEY]
            if weight_freq_seqs(KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]
        ],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }
    #todo: is this the actual topK?
    filtered_freq_seqs = {
        KEY: prefiltered_freq_seqs[KEY][0:FREQ_SEQS['top']]
        for KEY in CONFUSION_MATRIX
    }
    print(
        'CONFUSION MATRIX after filtering:\n', *[
            '\tlimefeats ' + KEY + ':' + str(len(filtered_limefeats[KEY]))
            for KEY in CONFUSION_MATRIX
        ], '\n', *[
            '\tfreq_seqs ' + KEY + ':' + str(len(filtered_freq_seqs[KEY]))
            for KEY in CONFUSION_MATRIX
        ])

    def printout_freq_seqs(output_obj, output_file, maxlinelength=5000):
        # Serialize results as pretty JSON to the given path.
        with open(output_file, 'w+') as f:
            f.write(prettyjson(output_obj, maxlinelength=maxlinelength))

    if (LIMEFEATS['outputfile'] is not None
            or FREQ_SEQS['outputfile'] is not None):
        print('Start saving results..')
        if (LIMEFEATS['outputfile'] is not None):
            printout_freq_seqs(filtered_limefeats,
                               LIMEFEATS['outputfile'],
                               maxlinelength=5000)
        if (FREQ_SEQS['outputfile'] is not None):
            printout_freq_seqs(filtered_freq_seqs,
                               FREQ_SEQS['outputfile'],
                               maxlinelength=200)
        print('Results saved.')
    else:
        print('FILTERED_LIMEFEATS:\n', filtered_limefeats)
        print('FILTERED_FREQ_SEQS:\n', filtered_freq_seqs)
    print('Computing absence...')
    # attribute name -> set of values ever observed (pre-filter).
    attributes = {}
    for KEY in CONFUSION_MATRIX:
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                attribute_name = event[0]
                if attribute_name not in attributes:
                    attributes[attribute_name] = set()
                attributes[attribute_name].add(event[1])
    # Count tassellated value occurrences per cell.
    attributes_occurrences = {
        'tp': collections.Counter(),
        'fp': collections.Counter(),
        'tn': collections.Counter(),
        'fn': collections.Counter()
    }
    for KEY in CONFUSION_MATRIX:
        found_stuff = []
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                found_stuff += [tassellate_numbers(event[1])]
        attributes_occurrences[KEY].update(found_stuff)
    # Cross attributes x occurrence counts, with a per-attribute 'Total'.
    characterised_attributes_occurrences = {}
    for KEY in CONFUSION_MATRIX:
        characterised_attributes_occurrences[KEY] = {}
        for attribute in attributes:
            if attribute not in characterised_attributes_occurrences[KEY]:
                characterised_attributes_occurrences[KEY][attribute] = dict()
            for attr in attributes[attribute]:
                characterised_attributes_occurrences[KEY][attribute][tassellate_numbers(attr)] = 0
    for KEY in CONFUSION_MATRIX:
        for occ in attributes_occurrences[KEY]:
            for attr in characterised_attributes_occurrences[KEY]:
                if occ in characterised_attributes_occurrences[KEY][attr]:
                    characterised_attributes_occurrences[KEY][attr][occ] = attributes_occurrences[KEY][occ]
        for attr in characterised_attributes_occurrences[KEY]:
            characterised_attributes_occurrences[KEY][attr]['Total'] = sum([
                characterised_attributes_occurrences[KEY][attr][element]
                for element in characterised_attributes_occurrences[KEY][attr]
            ])
    print('Absence computed.')
    print('The absence AFTER filtering is:\n',
          characterised_attributes_occurrences)
    print(
        'RE-computing the sequence pattern result after applying the thresholds...'
    )
    # Attribute-name prefixes treated as static (per-trace) features; all
    # entries are currently disabled.
    static_attr = [
        # 'Age',
        # 'ClaimValue',
        # 'CType',
        # 'ClType',
        # 'PClaims',
    ]
    # Split each trace's filtered events into static vs dynamic features;
    # for static features keep only the single highest-importance event.
    limefeats_static_dinamic = {}
    for KEY in CONFUSION_MATRIX:
        limefeats_static_dinamic[KEY] = {}
        for tid in filtered_limefeats[KEY]:
            limefeats_static_dinamic[KEY][tid] = {
                'static': [],
                'dynamic': [
                    att for att in filtered_limefeats[KEY][tid]
                    if not any([
                        att[0].startswith(static_att)
                        for static_att in static_attr
                    ])
                ]
            }
            current_static_attributes = [
                att for att in filtered_limefeats[KEY][tid]
                if any([
                    att[0].startswith(static_att)
                    for static_att in static_attr
                ])
            ]
            for s_attr in static_attr:
                curr_attributes = [
                    att for att in current_static_attributes
                    if att[0].startswith(s_attr)
                ]
                if len(curr_attributes) > 0:
                    # NOTE(review): both branches are identical (max by
                    # importance) — possibly min was intended for tn/fn.
                    if KEY in ['tp', 'fp']:
                        limefeats_static_dinamic[KEY][tid]['static'] += [
                            max(curr_attributes, key=lambda x: x[2])
                        ]
                    elif KEY in ['tn', 'fn']:
                        limefeats_static_dinamic[KEY][tid]['static'] += [
                            max(curr_attributes, key=lambda x: x[2])
                        ]
                    else:
                        print('Something bad happened')
    # Encode dynamic events as "feature_value" strings, sorted by feature.
    dynamic_data = {
        KEY: {
            tid: [
                # (element[0].split('_')[0] + '_' + element[1])
                (element[0] + '_' + element[1])
                for element in sorted(
                    [
                        k for k in limefeats_static_dinamic[KEY][tid]['dynamic']
                    ],
                    # key=lambda x: (x[0].split('_')[1], x[0].split('_')[0])
                    key=lambda x: x[0])
            ]
            for tid in limefeats_static_dinamic[KEY]
            if len(limefeats_static_dinamic[KEY][tid]['dynamic']) > 0
        }
        for KEY in CONFUSION_MATRIX
    }
    # Encode static events with tassellated (bucketed) values.
    static_data = {
        KEY: {
            tid: [
                (element[0].split('_')[0] + '_' + tassellate_numbers(element[1]))
                # (element[0] + '_' + tassellate_numbers(element[1]))
                for element in sorted([
                    k for k in limefeats_static_dinamic[KEY][tid]['static']
                ],
                                      key=lambda x: (x[0].split('_')[1], x[0].split('_')[0]))
            ]
            for tid in limefeats_static_dinamic[KEY]
            if len(limefeats_static_dinamic[KEY][tid]['static']) > 0
        }
        for KEY in CONFUSION_MATRIX
    }
    # Merge static + dynamic encodings per trace.
    data = {}
    for KEY in CONFUSION_MATRIX:
        data[KEY] = {}
        for tid in limefeats[KEY]:
            if tid in static_data[KEY] and tid in dynamic_data[KEY]:
                data[KEY][tid] = static_data[KEY][tid] + dynamic_data[KEY][tid]
            elif tid in static_data[KEY]:
                data[KEY][tid] = static_data[KEY][tid]
            elif tid in dynamic_data[KEY]:
                data[KEY][tid] = dynamic_data[KEY][tid]
    # Re-mine patterns on the filtered/encoded data (min support 2).
    if (MINING_METHOD == 'seq_mining'):
        freq_seqs_after_filter = {
            'tp':
            sorted(
                seqmining.freq_seq_enum(
                    [data['tp'][tid] for tid in data['tp']], 2)),
            'tn':
            sorted(
                seqmining.freq_seq_enum(
                    [data['tn'][tid] for tid in data['tn']], 2)),
            'fp':
            sorted(
                seqmining.freq_seq_enum(
                    [data['fp'][tid] for tid in data['fp']], 2)),
            'fn':
            sorted(
                seqmining.freq_seq_enum(
                    [data['fn'][tid] for tid in data['fn']], 2)),
        }
    if (MINING_METHOD == 'item_mining'):
        freq_seqs_after_filter = {
            'tp':
            itemmining.relim(itemmining.get_relim_input(
                [data['tp'][tid] for tid in data['tp']]),
                             min_support=2),
            'tn':
            itemmining.relim(itemmining.get_relim_input(
                [data['tn'][tid] for tid in data['tn']]),
                             min_support=2),
            'fp':
            itemmining.relim(itemmining.get_relim_input(
                [data['fp'][tid] for tid in data['fp']]),
                             min_support=2),
            'fn':
            itemmining.relim(itemmining.get_relim_input(
                [data['fn'][tid] for tid in data['fn']]),
                             min_support=2),
        }
    # Normalize to (pattern tuple, support) pairs for either method.
    freq_seqs_after_filter = {
        KEY: [(tuple(element), freq_seqs_after_filter[KEY][element])
              for element in freq_seqs_after_filter[KEY]]
        for KEY in CONFUSION_MATRIX
    }
    # Score, threshold and rank the re-mined patterns.
    filtered_freq_seqs_after_filter_old = {
        KEY: sorted([[
            element[0],
            weight_freq_seqs(KEY, available_values, element, limefeats)
        ] for element in freq_seqs_after_filter[KEY]
                     if weight_freq_seqs(
                         KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }
    # todo: filter topK
    filtered_freq_seqs_after_filter = {
        KEY: filtered_freq_seqs_after_filter_old[KEY][0:FREQ_SEQS['top']]
        for KEY in CONFUSION_MATRIX
    }
    print('Sequence pattern recomputed successfully.')
    if (FREQ_SEQS['outputfile'] is not None):
        print('Start saving results..')
        printout_freq_seqs(filtered_freq_seqs_after_filter,
                           FREQ_SEQS['RECOMPUTEDoutputfile'],
                           maxlinelength=200)
        print('Results saved.')
    else:
        print('RECOMPUTED_FREQ_SEQS:\n', filtered_freq_seqs_after_filter)
    print('Done, cheers!')
    return confusion_matrix, data, freq_seqs_after_filter, filtered_freq_seqs_after_filter
def seqs_mining(self):
    """Mine frequent sequences from self.transactions at self.min_sup, sorted."""
    mined = seqmining.freq_seq_enum(self.transactions, self.min_sup)
    return sorted(mined)
def __artist_freq_seq(self, artist_patterns, min_size):
    """Mine frequent artist sequences and store them on the profile."""
    mined = seqmining.freq_seq_enum(artist_patterns, min_size)
    self.profile['artist_pattern'] = self.__seq_pattern(mined)
import pandas as pd
# See: https://github.com/bartdag/pymining
from pymining import itemmining, assocrules, seqmining

enrollment = pd.read_csv('course-enrollment.csv')
# FIX: dict-of-renaming aggregation on a SeriesGroupBy
#   .agg({'course_id': lambda x: x.tolist()})
# was deprecated in pandas 0.20 and removed in 1.0. `.agg(list)` yields the
# same 'course_id' column of per-user course lists.
grouped = (enrollment.groupby('user_id')['course_id']
           .agg(list)
           .reset_index()[-40000:])
events = grouped.course_id.values.tolist()
# Frequent itemsets over the enrollment transactions.
relim_input = itemmining.get_relim_input(events)
report = itemmining.relim(relim_input, min_support=2)
print('Associative rules:')
rules = assocrules.mine_assoc_rules(report, min_support=5, min_confidence=0.6)
rules_df = pd.DataFrame.from_records(
    rules, columns=['from', 'to', 'support', 'confidence'])
print(rules_df.sort_values('support', ascending=False).head(10))
print('Frequent sequences: ')
freq_seqs = seqmining.freq_seq_enum(events, 5)
freq_seqs_df = pd.DataFrame.from_records(list(freq_seqs),
                                         columns=['sequence', 'support'])
freq_seqs_df['sequence_len'] = freq_seqs_df.sequence.apply(len)
# Show the longest, best-supported multi-course sequences.
print(freq_seqs_df[freq_seqs_df.sequence_len > 1].sort_values(
    ['sequence_len', 'support'], ascending=False).head(10))