def enumerate_frequent_seq(elements,support_threshold):
    '''Recursively walk the sequence lattice starting from `elements`.

    Each entry of `elements` is paired with every entry that follows it in
    key order and the pair is combined with temporal_join. A joined element
    is kept when its support -- the number of distinct sids among its
    events -- is at least `support_threshold`. The kept elements of one
    outer iteration are merged into the result and are also passed to a
    recursive call, whose own result is merged in as well.

    Returns a _KeyDefaultDict(Element) keyed by sequence.
    '''
    collected = _KeyDefaultDict(Element)
    # Snapshot of the keys; `elements` itself is not modified below.
    ordered_seqs = list(elements.keys())

    for position, left_seq in enumerate(ordered_seqs):
        survivors = _KeyDefaultDict(Element)

        # Only pair with later keys so each unordered pair is joined once.
        for right_seq in ordered_seqs[position + 1:]:
            joined = temporal_join(elements[left_seq], elements[right_seq])
            for candidate_seq, candidate in joined.items():
                distinct_sids = {event.sid for event in candidate.events}
                if len(distinct_sids) >= support_threshold:
                    survivors[candidate_seq] |= candidate

        # Record this level's survivors first, then whatever the recursion
        # derives from them.
        for candidate_seq, candidate in survivors.items():
            collected[candidate_seq] |= candidate

        deeper = enumerate_frequent_seq(survivors, support_threshold)
        for candidate_seq, candidate in deeper.items():
            collected[candidate_seq] |= candidate

    return collected
def enumerate_frequent_seq(elements,support_threshold):
    '''Recursively traverse the sequence lattice, generating further
    frequent sequences from the sequences provided in `elements`.

    Parameters:
        elements: mapping from a sequence to its Element. Entries are
            joined pairwise (each entry with every later entry, in key
            order) using temporal_join.
        support_threshold: minimum support, measured as the number of
            distinct sids among an element's events, for a joined element
            to be kept.

    Returns:
        A _KeyDefaultDict(Element) holding every kept element found at
        this level together with those found by the recursive calls.
    '''
    frequent_elements = _KeyDefaultDict(Element)

    # Materialise the keys once. On Python 3 `dict.keys()` is a view that
    # does not support slicing, so `elements.keys()[i+1:]` raises TypeError.
    seqs = list(elements.keys())

    for index_i, seq_i in enumerate(seqs):
        frequent_elements_inner = _KeyDefaultDict(Element)

        # Join seq_i only with the sequences after it, so that every
        # unordered pair is processed exactly once.
        for seq_j in seqs[index_i + 1:]:
            joined = temporal_join(elements[seq_i], elements[seq_j])
            for seq, element in joined.items():
                support = len({event.sid for event in element.events})
                if support >= support_threshold:
                    frequent_elements_inner[seq] |= element

        for seq, element in frequent_elements_inner.items():
            frequent_elements[seq] |= element

        # Descend: the elements kept for seq_i seed the next level.
        deeper = enumerate_frequent_seq(frequent_elements_inner, support_threshold)
        for seq, element in deeper.items():
            frequent_elements[seq] |= element

    return frequent_elements
def mine(sequences,support_threshold):
    '''Mine frequent sequences with SPADE (Zaki 2001).

    The work is done in three distinct steps:
    1. Identify frequent single elements.
    2. Identify frequent two-element sequences.
    3. Identify all remaining sequences of three elements or more.

    `sequences` is iterated as (sid, eid, itemset) triples. Support is the
    number of distinct sids among an element's events. The return value is
    the mapping produced by enumerate_frequent_seq with the results of
    steps 2 and 1 merged into it.
    '''
    # Step 1a: one Element per item, accumulating an Event per occurrence.
    all_items = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            item_seq = tuple(item)
            all_items[item_seq] |= Element(item_seq, Event(sid=sid, eid=eid))

    # Step 1b: drop the items that do not reach the support threshold.
    frequent_items = subset_to_support(all_items, support_threshold)

    # Step 2a: obtain the frequent two-element sequences.
    frequent_pairs = count_frequent_two_seq(frequent_items, support_threshold)

    # Step 2b: build the Elements (ID lists) for those pairs.
    pair_elements = _KeyDefaultDict(Element)
    for pair in frequent_pairs:
        first = frequent_items[tuple(pair[0])]
        second = frequent_items[tuple(pair[1])]
        for seq, element in temporal_join(first, second).items():
            distinct_sids = {event.sid for event in element.events}
            if len(distinct_sids) >= support_threshold:
                pair_elements[seq] |= element

    # Step 3: derive everything reachable from the two-element sequences.
    result = enumerate_frequent_seq(pair_elements, support_threshold)

    # Fold the shorter sequences into the final result: pairs first, then
    # single items.
    for found in (pair_elements, frequent_items):
        for seq, element in found.items():
            result[seq] |= element

    return result
def mine(sequences,support_threshold):
    '''Run SPADE (Zaki 2001) over `sequences` and return what it finds.

    SPADE is performed in three distinct steps:
    1. Identify frequent single elements.
    2. Identify frequent two-element sequences.
    3. Identify all remaining sequences of three elements or more.

    Each entry of `sequences` is unpacked as (sid, eid, itemset). An
    element counts as frequent when the number of distinct sids among its
    events is at least `support_threshold`.
    '''
    # Parse the input into per-item Elements.
    item_elements = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            key = tuple(item)
            occurrence = Event(sid=sid, eid=eid)
            item_elements[key] |= Element(key, occurrence)

    # Keep only the single elements that are frequent.
    item_elements = subset_to_support(item_elements, support_threshold)

    # Frequent two-element sequences, as reported by count_frequent_two_seq.
    two_seqs = count_frequent_two_seq(item_elements, support_threshold)

    # Generate the Elements for the two-element sequences found above.
    two_seq_elements = _KeyDefaultDict(Element)
    for two_seq in two_seqs:
        left = item_elements[tuple(two_seq[0])]
        right = item_elements[tuple(two_seq[1])]
        joined = temporal_join(left, right)
        for seq, element in joined.items():
            sids = {event.sid for event in element.events}
            if len(sids) >= support_threshold:
                two_seq_elements[seq] |= element

    # All remaining sequences are derived from the two-element ones.
    frequent = enumerate_frequent_seq(two_seq_elements, support_threshold)

    # Collect sequences of every length into one mapping.
    for seq, element in two_seq_elements.items():
        frequent[seq] |= element
    for seq, element in item_elements.items():
        frequent[seq] |= element

    return frequent
def subset_to_support(elements,support_threshold):
    '''Return only the entries of `elements` that meet the support threshold.

    The support of an element is the number of distinct sids among its
    events. Entries whose support is at least `support_threshold` are
    copied, under the same key, into a new _KeyDefaultDict(Element);
    `elements` itself is left untouched.
    '''
    kept = _KeyDefaultDict(Element)
    for name, candidate in elements.items():
        distinct_sids = {event.sid for event in candidate.events}
        if len(distinct_sids) >= support_threshold:
            kept[name] = candidate
    return kept
def temporal_join(element_i,element_j):
    '''Given two elements, return a dictionary of new elements indexed by
    their corresponding item sequences.

    Every pair of events (one from each element) that shares a sid
    contributes to the result:

    * different eids: the element with the earlier event keeps its whole
      sequence and the last element of the other one is appended; the new
      event carries the later eid (Corollary 1, Zaki 2001).
    * equal eids and different last elements: the two last elements are
      merged into one sorted, de-duplicated itemset string that replaces
      the last element of each input sequence.
    * equal eids and equal last elements: nothing is produced.

    Returns a _KeyDefaultDict(Element) keyed by the joined sequence.
    '''
    join_results = _KeyDefaultDict(Element)

    for event_i in element_i.events:
        for event_j in element_j.events:
            # Only events from the same input sequence can be combined.
            if event_i.sid != event_j.sid:
                continue
            sid = event_i.sid

            if event_i.eid > event_j.eid:
                # Append the last element as ONE element. `tuple(last)`
                # would iterate a string and split an itemset such as 'BF'
                # into the two elements 'B' and 'F'.
                superseq = element_j.seq + (element_i.seq[-1],)
                superseq_event = Event(sid=sid, eid=event_i.eid)
                join_results[superseq] |= Element(superseq, superseq_event)
            elif event_i.eid < event_j.eid:
                superseq = element_i.seq + (element_j.seq[-1],)
                superseq_event = Event(sid=sid, eid=event_j.eid)
                join_results[superseq] |= Element(superseq, superseq_event)
            elif element_i.seq[-1] != element_j.seq[-1]:
                superseq_event = Event(sid=sid, eid=event_j.eid)

                # For coincident atoms, join the last element of one atom
                # to the other; sorting keeps the itemset canonical.
                merged_itemset = ''.join(
                    sorted(set(element_i.seq[-1] + element_j.seq[-1]))
                )

                superseq_i = element_i.seq[:-1] + (merged_itemset,)
                join_results[superseq_i] |= Element(superseq_i, superseq_event)

                superseq_j = element_j.seq[:-1] + (merged_itemset,)
                # If both resulting atoms are identical, only add it once.
                if superseq_j != superseq_i:
                    join_results[superseq_j] |= Element(superseq_j, superseq_event)

    return join_results