コード例 #1
0
def enumerate_frequent_seq(elements,support_threshold):
    '''Collect every frequent sequence reachable by repeatedly joining the
    given elements with one another.

    Each element is temporally joined with every element that follows it in
    the mapping's key order. A join result is kept when the number of
    distinct `sid` values among its events is at least `support_threshold`.
    The function then recurses on the kept results to build the next
    generation.

    Returns a mapping from sequence to Element containing everything kept at
    any depth of the recursion.
    '''

    collected = _KeyDefaultDict(Element)
    ordered_seqs = list(elements.keys())

    for position, left_seq in enumerate(ordered_seqs):

        survivors = _KeyDefaultDict(Element)

        # pair the current element only with the ones after it, so each
        # unordered pair of elements is joined exactly once
        for right_seq in ordered_seqs[position + 1:]:
            joined = temporal_join(elements[left_seq], elements[right_seq])

            for candidate_seq, candidate in joined.items():
                # support is the number of distinct sids the candidate has
                distinct_sids = {event.sid for event in candidate.events}
                if len(distinct_sids) >= support_threshold:
                    survivors[candidate_seq] |= candidate

        # record this generation's survivors ...
        for found_seq, found in survivors.items():
            collected[found_seq] |= found

        # ... and everything that can be grown from them
        deeper = enumerate_frequent_seq(survivors, support_threshold)
        for found_seq, found in deeper.items():
            collected[found_seq] |= found

    return collected
コード例 #2
0
ファイル: miner.py プロジェクト: shahin/sequenceminer
def enumerate_frequent_seq(elements,support_threshold):
    '''Recursively traverse the sequence lattice, generating longer frequent
    sequences from the ones provided in `elements`.

    Every element is temporally joined with each element that follows it in
    the mapping's key order. Join results whose events cover at least
    `support_threshold` distinct `sid` values are kept and are themselves
    fed back into this function to produce the next generation.

    elements -- mapping from sequence to Element
    support_threshold -- minimum number of distinct sids a sequence needs

    Returns a mapping from sequence to Element holding every frequent
    sequence discovered at any depth of the recursion (the input elements
    themselves are not included).
    '''

    frequent_elements = _KeyDefaultDict(Element)

    # Materialise the keys once. On Python 3 `dict.keys()` is a view that
    # cannot be sliced, and a single snapshot also avoids rebuilding the list
    # on every pass of the outer loop.
    seqs = list(elements.keys())

    for element_index_i, seq_i in enumerate(seqs):

        frequent_elements_inner = _KeyDefaultDict(Element)

        # only pair seq_i with later keys so each pair is joined once
        for seq_j in seqs[element_index_i + 1:]:

            R = temporal_join(elements[seq_i], elements[seq_j])

            for seq, element in R.items():
                # support = number of distinct sids in which the sequence occurs
                support = len({event.sid for event in element.events})
                if support >= support_threshold:
                    frequent_elements_inner[seq] |= element

        for seq, element in frequent_elements_inner.items():
            frequent_elements[seq] |= element

        # grow the next generation from the sequences that were just found
        for seq, element in enumerate_frequent_seq(frequent_elements_inner, support_threshold).items():
            frequent_elements[seq] |= element

    return frequent_elements
コード例 #3
0
def mine(sequences,support_threshold):
    '''Mine frequent sequences with SPADE (Zaki 2001).

    The work is done in three stages:
    1. Find the frequent single elements.
    2. Find the frequent two-element sequences.
    3. Find all remaining frequent sequences of three or more elements.

    `sequences` is iterated as (sid, eid, itemset) triples. The result maps
    every sequence kept by any of the three stages to its Element.
    '''

    # stage 0: one Element per individual item, accumulating its events
    elements = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            item_seq = tuple(item)
            elements[item_seq] |= Element(item_seq, Event(sid=sid, eid=eid))

    # stage 1: drop the items that do not reach the support threshold
    elements = subset_to_support(elements, support_threshold)

    # stage 2: candidate pairs come from count_frequent_two_seq; their
    # elements are then built by joining the two single-item elements
    frequent_pairs = count_frequent_two_seq(elements, support_threshold)
    pair_elements = _KeyDefaultDict(Element)

    for pair in frequent_pairs:
        joined = temporal_join(elements[tuple(pair[0])], elements[tuple(pair[1])])

        for seq, element in joined.items():
            distinct_sids = {event.sid for event in element.events}
            if len(distinct_sids) >= support_threshold:
                pair_elements[seq] |= element

    # stage 3: everything that can be grown from the frequent pairs
    freq = enumerate_frequent_seq(pair_elements, support_threshold)

    # fold the shorter sequences into the final result as well
    for shorter in (pair_elements, elements):
        for seq, element in shorter.items():
            freq[seq] |= element

    return freq
コード例 #4
0
ファイル: miner.py プロジェクト: shahin/sequenceminer
def mine(sequences,support_threshold):
    '''Run SPADE (Zaki 2001) over `sequences` and return what is frequent.

    Steps performed:
    1. Identify frequent single elements.
    2. Identify frequent two-element sequences.
    3. Identify all remaining sequences of three elements or more.

    Each entry of `sequences` is unpacked as (sid, eid, itemset). The return
    value is a mapping from sequence to Element that contains the output of
    all three steps.
    '''

    # turn every item occurrence into an event on that item's Element
    single_items = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            key = tuple(item)
            occurrence = Event(sid=sid, eid=eid)
            single_items[key] |= Element(key, occurrence)

    # step 1
    frequent_singles = subset_to_support(single_items, support_threshold)

    # step 2: count_frequent_two_seq proposes the pairs, temporal_join
    # produces their elements, and the support check is applied again
    two_seq_elements = _KeyDefaultDict(Element)
    for two_seq in count_frequent_two_seq(frequent_singles, support_threshold):
        first = frequent_singles[tuple(two_seq[0])]
        second = frequent_singles[tuple(two_seq[1])]

        for seq, element in temporal_join(first, second).items():
            sids = {event.sid for event in element.events}
            if len(sids) >= support_threshold:
                two_seq_elements[seq] |= element

    # step 3
    freq = enumerate_frequent_seq(two_seq_elements, support_threshold)

    # merge the results of steps 2 and 1 into the final mapping
    for seq, element in two_seq_elements.items():
        freq[seq] |= element
    for seq, element in frequent_singles.items():
        freq[seq] |= element

    return freq
コード例 #5
0
def subset_to_support(elements,support_threshold):
    '''Filter a mapping of elements down to the sufficiently supported ones.

    An element's support is the number of distinct `sid` values among its
    events. Elements whose support is at least `support_threshold` are placed
    in a new mapping under their original keys; the input mapping is only
    read.
    '''

    kept = _KeyDefaultDict(Element)

    for name, candidate in elements.items():
        distinct_sids = {event.sid for event in candidate.events}
        if len(distinct_sids) >= support_threshold:
            kept[name] = candidate

    return kept
コード例 #6
0
ファイル: miner.py プロジェクト: shahin/sequenceminer
def subset_to_support(elements,support_threshold):
    '''Return only the elements that meet the support threshold.

    Support is counted as the number of different `sid` values found on an
    element's events. The result is a fresh mapping that reuses the original
    keys and Element objects for every element whose support is greater than
    or equal to `support_threshold`.
    '''

    supported = _KeyDefaultDict(Element)

    for seq, elem in elements.items():
        sids_seen = set()
        for event in elem.events:
            sids_seen.add(event.sid)

        if len(sids_seen) >= support_threshold:
            supported[seq] = elem

    return supported
コード例 #7
0
def temporal_join(element_i,element_j):
    '''Given two elements, return a dictionary of new elements indexed by
    their corresponding item sequences.

    Every pair of events (one from each element) that shares a `sid` is
    examined:

    * different `eid`s -- the sequence of the element that occurs first is
      extended with the last itemset of the one that occurs later, and the
      new event takes the later `eid` (Corollary 1, Zaki 2001);
    * same `eid`, different last itemsets -- the two last itemsets are merged
      into one sorted itemset, which replaces the last itemset of each
      sequence;
    * same `eid`, identical last itemsets -- nothing is produced.

    element_i, element_j -- Elements exposing `.seq` (tuple of itemset
        strings) and `.events` (iterable of events with `.sid` and `.eid`)

    Returns a mapping from each new sequence to an Element accumulating the
    events at which that sequence occurs.
    '''

    join_results = _KeyDefaultDict(Element)

    for event_i in element_i.events:
        for event_j in element_j.events:

            # only events from the same input sequence can be combined
            if event_i.sid != event_j.sid:
                continue

            sid = event_i.sid
            last_i = element_i.seq[-1]
            last_j = element_j.seq[-1]

            if event_i.eid > event_j.eid:
                # element_j happens first. The appended itemset is wrapped in
                # a 1-tuple: calling tuple() on a merged itemset such as 'AB'
                # would split it into the separate steps 'A' and 'B'.
                superseq = element_j.seq + (last_i,)
                superseq_event = Event(sid=sid, eid=event_i.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif event_i.eid < event_j.eid:
                # element_i happens first (mirror image of the case above)
                superseq = element_i.seq + (last_j,)
                superseq_event = Event(sid=sid, eid=event_j.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif last_i != last_j:
                # coincident events: merge the two last itemsets into one
                # itemset with sorted, de-duplicated items
                superseq_event = Event(sid=sid, eid=event_j.eid)
                merged_itemset = ''.join(sorted(set(last_i + last_j)))

                superseq_i = element_i.seq[:-1] + (merged_itemset,)
                join_results[superseq_i] |= Element(superseq_i, superseq_event)

                superseq_j = element_j.seq[:-1] + (merged_itemset,)

                # if both resulting sequences are identical, only add it once
                if superseq_j != superseq_i:
                    join_results[superseq_j] |= Element(superseq_j, superseq_event)

    return join_results
コード例 #8
0
ファイル: miner.py プロジェクト: shahin/sequenceminer
def temporal_join(element_i,element_j):
    '''Given two elements, return a dictionary of new elements indexed by
    their corresponding item sequences.

    Each event of `element_i` is compared with each event of `element_j`.
    Pairs with different `sid`s are ignored. For the remaining pairs:

    * if the `eid`s differ, the earlier element's sequence is extended with
      the last itemset of the later element and the resulting event carries
      the later `eid` (Corollary 1, Zaki 2001);
    * if the `eid`s are equal and the sequences end in different itemsets,
      those itemsets are merged (sorted, without duplicates) and the merged
      itemset replaces the last itemset of each sequence;
    * if the `eid`s are equal and the last itemsets are the same, no new
      sequence is generated.

    element_i, element_j -- Elements exposing `.seq` (tuple of itemset
        strings) and `.events` (iterable of events with `.sid` and `.eid`)

    Returns a mapping from new sequence to Element; events for the same
    sequence are accumulated on a single Element.
    '''

    join_results = _KeyDefaultDict(Element)

    for event_i in element_i.events:
        for event_j in element_j.events:

            if event_i.sid == event_j.sid:

                sid = event_i.sid
                tail_i = element_i.seq[-1]
                tail_j = element_j.seq[-1]

                # these two elements occur in the same sequence; if they
                # occur at different times (different eids), the combined
                # sequence has the later eid
                if event_i.eid > event_j.eid:
                    # Append the whole itemset as one step. Using
                    # tuple(tail_i) would break a multi-item itemset like
                    # 'AB' into the two consecutive steps 'A' and 'B'.
                    superseq = element_j.seq + (tail_i,)
                    superseq_event = Event(sid=sid, eid=event_i.eid)
                    join_results[superseq] |= Element(superseq, superseq_event)

                elif event_i.eid < event_j.eid:
                    superseq = element_i.seq + (tail_j,)
                    superseq_event = Event(sid=sid, eid=event_j.eid)
                    join_results[superseq] |= Element(superseq, superseq_event)

                elif tail_i != tail_j:

                    superseq_event = Event(sid=sid, eid=event_j.eid)

                    # for coincident events, fold both last itemsets into a
                    # single itemset and keep its items sorted
                    combined = ''.join(sorted(set(tail_i + tail_j)))

                    superseq_i = element_i.seq[:-1] + (combined,)
                    join_results[superseq_i] |= Element(superseq_i, superseq_event)

                    superseq_j = element_j.seq[:-1] + (combined,)

                    # if both resulting sequences are identical, add it once
                    if superseq_j != superseq_i:
                        join_results[superseq_j] |= Element(superseq_j, superseq_event)

    return join_results