Esempio n. 1
0
def find_frequent(data, mis, sdc):
    """Return the items whose relative support reaches their own MIS value.

    Args:
        data: Sized collection of transactions; ``len(data)`` is used as the
            denominator when turning a support value into a fraction.
        mis: Mapping of item -> minimum item support threshold. The threshold
            is compared against ``actual_support / len(data)``.
        sdc: Not used by this function; kept so existing callers keep working.

    Returns:
        A list of ``(item, mis[item])`` tuples, in the iteration order of
        ``mis``, for every item whose support fraction is >= its threshold.
        An empty list is returned when ``data`` contains no transactions.
    """
    total_transactions = float(len(data))

    # Without transactions no support fraction can be computed; bail out
    # instead of dividing by zero below.
    if not total_transactions:
        return []

    # util.actual_support is presumably the number of transactions that
    # contain the single-item sequence [[item]] -- confirm against util.
    return [
        (item, threshold)
        for item, threshold in mis.items()
        if float(util.actual_support(data, [[item]])) / total_transactions >= threshold
    ]
Esempio n. 2
0
def main():
    """Mine sequential patterns from the input file and write them out.

    Reads transactions from INPUT_FILE_NAME and the (mis, sdc) parameters
    from PARAM_FILE_NAME, collects patterns in the module-level
    ``final_output`` mapping (pattern length -> list of (pattern, count)),
    and writes a per-length report to OUTPUT_FILE_NAME.
    """
    data = parsefile.parse_input_file(INPUT_FILE_NAME)
    mis, sdc = parsefile.parse_param_file(PARAM_FILE_NAME)
    support_dict_for_elements = util.calculate_support_for_elements(list(mis), data)

    # Step 1: find frequent items.
    frequent_items = find_frequent(data, mis, sdc)

    # Step 2: sort frequent items in ascending order of their MIS value,
    # then drop the MIS values and keep only the item IDs.
    frequent_items.sort(key=lambda pair: pair[1])
    frequent_items = [pair[0] for pair in frequent_items]

    # Level 1: every frequent item is a pattern of length one.
    final_output[1] = [
        ([[item]], util.actual_support(data, [[item]])) for item in frequent_items
    ]

    for item in frequent_items:
        # NOTE(review): len(data) is re-read every iteration while `data` is
        # replaced at the bottom of the loop; confirm whether the helper can
        # change the number of transactions.
        item_mis_as_int = math.ceil(mis[item] * len(data))
        transaction_subset = util.get_S_K_for_item(data, item, sdc, list(mis))

        sequence_generator = util.SequenceGenerator(
            item,
            item_mis_as_int,
            transaction_subset,
            frequent_items,
            list(mis),
            support_dict_for_elements,
            sdc,
        )
        for sequence, transactions in sequence_generator.sequence_transaction_list:
            # Pattern length is the total number of items across all elements
            # of the sequence. Distinct names are used here so the outer
            # `item` is never shadowed (it is needed again below).
            pattern_length = sum(1 for element in sequence for _ in element)
            final_output.setdefault(pattern_length, []).append(
                (sequence, len(transactions))
            )

        # The processed item is removed before handling the next one.
        data = util.remove_item_from_transactions(item, data)

    with open(OUTPUT_FILE_NAME, 'w') as output_file:
        for k in sorted(final_output.keys()):
            output_file.write('\nThe number of length '+str(k)+' sequential patterns is '+ str(len(final_output[k])) + '\n\n')
            for patterns in final_output[k]:
                output_file.write('Pattern: '+pprint_result(str(patterns[0]))+' Count: '+str(patterns[1])+'\n')