def enumerate_frequent_seq(elements, support_threshold):
    '''Recursively traverse the sequence lattice, generating longer frequent
    sequences from the sequences provided in the `elements` parameter.

    Each element is joined (via temporal_join) with every element that
    follows it in iteration order. A join result is kept when its support --
    the number of distinct sids among its events -- is at least
    `support_threshold`. The results kept for one left-hand element are then
    expanded recursively in the same way.

    Parameters:
        elements: mapping from sequence to the element holding its events.
        support_threshold: minimum number of distinct sids a sequence needs
            in order to be kept.

    Returns:
        A _KeyDefaultDict(Element) keyed by sequence containing the frequent
        join results found at this level and at every deeper level. The
        input elements themselves are not included.
    '''

    frequent_elements = _KeyDefaultDict(Element)

    # Snapshot the keys once: the slice below needs a real list, and
    # `elements` is never modified here, so there is no reason to rebuild it
    # on every outer iteration.
    seqs = list(elements.keys())

    for index_i, seq_i in enumerate(seqs):

        # frequent results of joining seq_i with each later sequence
        frequent_elements_inner = _KeyDefaultDict(Element)

        for seq_j in seqs[index_i + 1:]:
            joined = temporal_join(elements[seq_i], elements[seq_j])

            for seq, element in joined.items():
                support = len({event.sid for event in element.events})
                if support >= support_threshold:
                    frequent_elements_inner[seq] |= element

        for seq, element in frequent_elements_inner.items():
            frequent_elements[seq] |= element

        # Descend one level; an empty inner dict ends the recursion because
        # the outer loop then has nothing to iterate over.
        deeper = enumerate_frequent_seq(frequent_elements_inner,
                                        support_threshold)
        for seq, element in deeper.items():
            frequent_elements[seq] |= element

    return frequent_elements
def mine(sequences, support_threshold):
    '''Mine frequent sequences with SPADE (Zaki 2001).

    The work is done in three stages:
    1. Find the frequent single elements.
    2. Find the frequent two-element sequences.
    3. Find all remaining frequent sequences of three elements or more.

    `sequences` is iterated as (sid, eid, itemset) triples. The return value
    maps each frequent sequence found, of any length, to its element.
    '''

    # stage 1: one Element per individual item, then drop the infrequent ones
    item_elements = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            item_elements[tuple(item)] |= Element(
                tuple(item), Event(sid=sid, eid=eid))

    frequent_items = subset_to_support(item_elements, support_threshold)

    # stage 2: candidate two-element sequences come from the horizontal
    # database count; their ID lists are rebuilt with temporal joins
    frequent_pairs = count_frequent_two_seq(frequent_items, support_threshold)

    pair_elements = _KeyDefaultDict(Element)
    for pair in frequent_pairs:
        left = frequent_items[tuple(pair[0])]
        right = frequent_items[tuple(pair[1])]
        joined = temporal_join(left, right)

        for seq, element in joined.items():
            distinct_sids = {event.sid for event in element.events}
            if len(distinct_sids) >= support_threshold:
                pair_elements[seq] |= element

    # stage 3: everything longer than two elements
    result = enumerate_frequent_seq(pair_elements, support_threshold)

    # fold the shorter sequences into the result so it covers every length
    for found in (pair_elements, frequent_items):
        for seq, element in found.items():
            result[seq] |= element

    return result
def temporal_join(element_i, element_j):
    '''Given two elements, return a dictionary of new elements indexed by
    their corresponding item sequences.

    Every pair of events (one from each element) that shares a sid is
    examined:

    * Different eids: the sequence of the earlier event is extended with
      tuple(last entry) of the later event's sequence, and the new event
      carries the later eid (Corollary 1, Zaki 2001).
    * Equal eids, different last entries: the two last entries are merged
      into one sorted, de-duplicated itemset string, which replaces the last
      entry of each input sequence.
    * Equal eids, equal last entries: nothing is produced.

    Parameters:
        element_i, element_j: elements exposing `.seq` (a tuple) and
            `.events` (an iterable of events with `.sid` and `.eid`).

    Returns:
        A _KeyDefaultDict(Element) mapping each produced sequence to an
        element accumulating its events.
    '''

    join_results = _KeyDefaultDict(Element)

    for event_i in element_i.events:
        for event_j in element_j.events:

            # only events from the same input sequence can be combined
            if event_i.sid != event_j.sid:
                continue

            sid = event_i.sid

            # NOTE(review): tuple() iterates over the last entry, so a
            # multi-character itemset string such as 'AB' is appended as
            # ('A', 'B') rather than ('AB',) -- confirm this is intended.
            if event_i.eid > event_j.eid:
                # j happens first, so i's last entry follows j's sequence
                superseq = element_j.seq + tuple(element_i.seq[-1])
                superseq_event = Event(sid=sid, eid=event_i.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif event_i.eid < event_j.eid:
                # i happens first, so j's last entry follows i's sequence
                superseq = element_i.seq + tuple(element_j.seq[-1])
                superseq_event = Event(sid=sid, eid=event_j.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif element_i.seq[-1] != element_j.seq[-1]:
                superseq_event = Event(sid=sid, eid=event_j.eid)

                # for coincident atoms, join the last element of one atom to
                # the other; sorting keeps the itemset in canonical order
                merged_itemset = ''.join(
                    sorted(set(element_i.seq[-1] + element_j.seq[-1])))

                superseq_i = element_i.seq[:-1] + (merged_itemset,)
                join_results[superseq_i] |= Element(
                    superseq_i, superseq_event)

                superseq_j = element_j.seq[:-1] + (merged_itemset,)

                # if both resulting atoms are identical, only add it once
                if superseq_j != superseq_i:
                    join_results[superseq_j] |= Element(
                        superseq_j, superseq_event)

    return join_results
def subset_to_support(elements, support_threshold):
    '''Filter an IdList down to the atoms that meet the support threshold.

    An atom's support is the number of distinct sids among its events. The
    surviving atoms are returned in a new IdList; the element objects
    themselves are reused, not copied.
    '''

    kept = _KeyDefaultDict(Element)

    for name, element in elements.items():
        distinct_sids = {event.sid for event in element.events}
        if len(distinct_sids) >= support_threshold:
            kept[name] = element

    return kept