Example #1
0
    def test_pgbs(self):
        """End-to-end PGBS sanitisation check against a fixed information-loss value.

        Samples the chess dataset, picks 10 random sensitive itemsets, runs
        PGBS in place, and compares the information loss between the original
        and sanitised frequent-itemset collections.

        NOTE(review): both ``.sample()`` calls below are unseeded, so the
        sampled transactions and sensitive itemsets differ on every run and
        the exact value asserted at the end (0.5542) is not reproducible --
        this test is flaky. A proper fix passes ``random_state`` to both
        samples and recomputes the expected value under that seed. TODO confirm.
        """

        basket_sets = im.import_dataset("chess").sample(100) # limit due to testing

        # PGBS mutates its input in place, so keep a pristine copy for the
        # "before" side of the comparison.
        original_database = basket_sets.copy()
        modified_database = basket_sets.copy()

        # We partition the Chess databases into 5 bins, then randomly select 2 itemsets from each bin,
        # assign the minimum support threshold as the minimum support given in the support range
        # This takes a long time, so will just use their values. Table 3: Support ranges for databases.
        sigma_min = min([0.6001, 0.6136, 0.6308, 0.6555, 0.6974])

        # Model threshold used to mine the reference frequent itemsets.
        sigma_model = 0.5
        original_IS = fpgrowth(original_database, min_support=sigma_model, use_colnames=True)

        # Get 10 sensitive itemsets
        sensitive_IS = original_IS.sample(10)
        # PGBS expects one (itemset, threshold) row per sensitive itemset.
        sensitive_IS_PGBS = pd.DataFrame({
            'itemset': [list(IS) for IS in sensitive_IS["itemsets"]],
            'threshold': [sigma_min for _ in sensitive_IS["support"]]})

        pgbs(modified_database, sensitive_IS_PGBS)

        # Give all itemsets and supports in D (original_database)
        a = original_IS

        # Give all itemsets and supports in D' (modified_database)
        b = fpgrowth(modified_database, min_support=sigma_model, use_colnames=True)

        il = information_loss(a, b)
        self.assertEqual(0.5542, round(il, 4))
    @classmethod
    def setUpClass(cls):
        """Load the toy dataset once and mine all itemsets / closed itemsets.

        unittest calls ``cls.setUpClass()`` with no arguments, so the
        ``@classmethod`` decorator is required -- without it (as in the
        original) the zero-argument call raises TypeError before any test
        runs. Results are stored on the class so every test shares them.
        """

        # Get toy data, WARNING! Had to change relative reference for this to work
        cls.basket_sets = im.import_dataset("toydata")

        # Abuse FPGrowth using absolute smallest min support to get all itemsets as frequent itemsets
        sigma_model = 1 / len(cls.basket_sets)
        cls.original_IS = fpgrowth(cls.basket_sets,
                                   min_support=sigma_model,
                                   use_colnames=True,
                                   verbose=False)

        # Compute closed itemsets of original data base
        cls.original_Closed_IS, _ = get_closed_itemsets(
            cls.basket_sets, sigma_model)
def main(datasets):
    """Sweep MRPS lower thresholds over the given datasets and report loss.

    datasets maps each dataset name to [sigma_model, sigma_min, ...]; for a
    fixed k of 30 sensitive itemsets, each lower threshold in the sweep is
    sanitised with rps_two_thresholds, plotted, and its information loss
    printed.
    """
    for name in datasets:
        model_threshold = datasets[name][0]
        upper_threshold = datasets[name][1]
        top_k = 30

        for lower_threshold in [0.7, 0.725, 0.75, 0.775]:
            #Load dataset (fresh copy for every sweep value)
            transactions = im.import_dataset(name)
            transactions = transactions.astype('bool')  #This may be needed for some datasets
            print("\n", name, "imported\n")

            #Convert to closed itemsets
            closed_model, freq_model = get_closed_itemsets(transactions, model_threshold)
            freq_original = freq_model.loc[freq_model["support"] >= upper_threshold]
            sensitive = get_top_k_sensitive_itemsets(freq_original, top_k)

            #Convert to pandas format for MRPS input: one row per sensitive
            #itemset with its upper and lower support thresholds.
            uppers = np.full(len(sensitive), upper_threshold)
            lowers = np.full(len(sensitive), lower_threshold)
            sensitive_df = pd.DataFrame(data=[sensitive, uppers, lowers]).T
            sensitive_df.columns = [
                'itemset', 'upper_threshold', 'lower_threshold'
            ]

            #Run RPS random threshold
            sanitized_closed = rps_two_thresholds(
                model=closed_model, sensitiveItemsets=sensitive_df)

            #Reproduce frequent itemsets from the sanitised closed itemsets
            sanitized_db = itemsets_from_closed_itemsets(
                closed_itemsets=sanitized_closed,
                possible_itemsets=freq_model['itemsets'])

            #Plot support graphs
            dual_support_graph_distribution(
                freq_model, sanitized_db, model_threshold, name +
                "_presentation_" + str(lower_threshold) + "_" + str(top_k))

            loss = information_loss(freq_model.copy(), sanitized_db)
            print(lower_threshold, loss)
def main(datasets):
    """Summarise basic transaction statistics for each dataset.

    For every dataset name in *datasets*, loads the transaction matrix and
    records its size plus minimum / maximum / average transaction length.

    Returns the summary as a pandas DataFrame (also printed).
    """
    df = pd.DataFrame(columns=[
        'Dataset Name', 'Number of transactions', 'Number of Unique items',
        'Minimum Transaction Length', 'Maximum Transaction Length',
        'Average Transaction Length'
    ])

    for dataset_name in datasets:
        print("Analysing", dataset_name)
        data = im.import_dataset(dataset_name)

        data = data.astype('bool')

        # Row sums of the boolean matrix give per-transaction lengths;
        # vectorised replacement for the original iterrows() loop.
        transaction_lengths = data.sum(axis=1)

        new_row = {
            'Dataset Name': dataset_name,
            'Number of transactions': data.shape[0],
            'Number of Unique items': data.shape[1],
            'Minimum Transaction Length': int(transaction_lengths.min()),
            'Maximum Transaction Length': int(transaction_lengths.max()),
            'Average Transaction Length': transaction_lengths.sum() / data.shape[0]
        }

        # DataFrame.append was removed in pandas 2.0; concat a one-row frame.
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    print(df)
    return df
def main(dataset, min_sup):
    """Mine frequent itemsets for *dataset* and plot their support distribution.

    dataset -- name understood by im.import_dataset
    min_sup -- minimum support threshold passed to FP-Growth
    """
    print("Processing:", dataset)
    transactions = im.import_dataset(dataset)

    # Mine frequent itemsets at the requested threshold, then render the
    # support-distribution plot for this dataset.
    frequent = fpgrowth(transactions, min_support=min_sup, use_colnames=True)
    support_graph_distribution(frequent, min_sup, dataset)

    # A dual-distribution plot (original vs. perturbed supports) can be
    # produced with dual_support_graph_distribution if needed for testing.


#Manually assigning minimum supports
# datasets = {"toydata": 0.005,
#             "BMS1": 0.00085,
#             "BMS2": 0.0005,
#             "uci_retail": 0.005,
#             "mushroom": 0.1,
#             "Belgian_retail": 0.0005,
#             "chess": 0.7,
#             "connect": 0.8,
#             "pumsb": 0.83,
#             "pumsb_star": 0.38,
#             "T40I10D100K": 0.011,
#             "T10I4D100K": 0.001,
#             "accidents": 0.38,
#             "instacart": 0.005}

# for key, value in datasets.items():
#     main(key, value)
def main(dataset_name, threshold):
    """Compare closed itemsets from the new and old implementations.

    Mines closed itemsets with both get_closed_itemsets_new and
    get_closed_itemsets, then prints how many agree, how many the new
    implementation produces that the old one lacks, and how many itemsets
    above *threshold* the new implementation is missing.
    """
    data = im.import_dataset(dataset_name)

    CI_n = get_closed_itemsets_new(data, threshold)[0]
    CI_o = get_closed_itemsets(data, threshold)[0]

    same = []     # closed itemsets found by both implementations
    have = []     # itemsets only the new implementation reports
    missing = []  # supports of itemsets the new implementation lacks

    for CI in CI_o:
        if CI in CI_n:
            same.append(CI)
        elif CI_o[CI] > threshold:
            missing.append(CI_o[CI])

    for CI in CI_n:
        if CI not in CI_o:
            have.append(CI)

    print("Similar closed:", len(same))
    print("Need to remove:", len(have))
    print("Need to add to:", len(missing))
Example #7
0
def main(datasets, algorithm, i):
    """Benchmark RPS / MRPS sanitisation across datasets and thresholds.

    For each dataset (mapping name -> [sigma_model, sigma_min, ...]) this
    mines closed itemsets at the model threshold, then for every support
    threshold and sensitive-set size runs the chosen algorithm and records
    hiding/quality metrics.

    datasets  -- dict of dataset name -> list of thresholds (first entry is
                 the model threshold, the rest are support thresholds)
    algorithm -- "RPS" or "MRPS"; anything else raises ValueError
    i         -- run index used to name the output CSV and plots

    Side effects: writes table_10.csv and table_11_<i>.csv incrementally and
    saves dual support-distribution plots.
    """
    #Create the base of a table
    table_11 = pd.DataFrame(columns=[
        'Model', 'Support threshold', 'Model threshold', 'Sensitive itemsets',
        'Number of FI before sanitization',
        'Number of FI containing an element of S before sanitization',
        'Information loss expected', 'Number of FI after sanitization',
        'Number of FI containing an element of S after RPS', 'Hiding failure',
        'Artifactual patterns', 'Misses cost', 'Side effects factor',
        'Information loss', 'RPS Time'
    ])

    table_10 = pd.DataFrame(columns=[
        'Dataset', 'Model threshold', 'Number of Closed frequent itemsets',
        'Number of frequent itemsets', 'Time closed itemsets'
    ])

    #Loop through datasets
    for dataset in datasets:
        sigma_model = datasets[dataset][0]

        #Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  #This may be needed for some datasets
        print("\n", dataset, "imported\n")

        #Start total timer
        total_time_start = time.time()

        #Convert to closed itemsets
        current_model, freq_model = get_closed_itemsets(data, sigma_model)

        new_row = {
            'Dataset': dataset,
            'Model threshold': sigma_model,
            'Number of Closed frequent itemsets': len(current_model),
            'Number of frequent itemsets': len(freq_model),
            'Time closed itemsets': time.time() - total_time_start
        }

        print(new_row)
        #DataFrame.append was removed in pandas 2.0; concat a one-row frame.
        table_10 = pd.concat([table_10, pd.DataFrame([new_row])],
                             ignore_index=True)
        table_10.to_csv('table_10.csv')

        #Loop through support thresholds
        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            #Find original frequent itemsets at frequency sigma min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30]:
                print("-", dataset, ":", k_freq, "Sensitive itemsets")

                #Copy the model so we can edit it directly
                copied_model = current_model.copy()

                #We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(
                    freq_original, k_freq)
                num_FI_containing_S = count_FI_containing_S(
                    freq_original, sensitive_IS)

                if algorithm == "RPS":
                    #Start timer for RPS portion
                    total_time_start = time.time()

                    #Run RPS
                    sanitized_closed_IS = rps(model=copied_model,
                                              sensitiveItemsets=sensitive_IS,
                                              supportThreshold=sigma_min)

                elif algorithm == "MRPS":
                    #Convert to pandas format for MRPS input
                    sensitive_IS_pandas = pd.DataFrame(
                        data=[(sensitive_IS),
                              np.full((len(sensitive_IS)), sigma_min),
                              np.full((len(sensitive_IS)), sigma_min - 0.5 *
                                      (sigma_min - sigma_model))]).T

                    sensitive_IS_pandas.columns = [
                        'itemset', 'upper_threshold', 'lower_threshold'
                    ]

                    #Start timer for RPS portion
                    total_time_start = time.time()

                    #Run RPS random threshold
                    sanitized_closed_IS = rps_two_thresholds(
                        model=copied_model,
                        sensitiveItemsets=sensitive_IS_pandas)

                else:
                    #Fail fast: otherwise sanitized_closed_IS would be unbound
                    #below and raise a confusing NameError.
                    raise ValueError(f"Unknown algorithm: {algorithm}")

                #Reproduce frequent itemsets
                sanitized_DB = itemsets_from_closed_itemsets(
                    closed_itemsets=sanitized_closed_IS,
                    possible_itemsets=freq_model['itemsets'])

                rps_time = time.time()

                #Calculating metrics
                #Variables needed
                freq_sanitized = sanitized_DB.loc[
                    sanitized_DB["support"] >= sigma_min]

                #Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(
                    freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(
                    freq_original, sensitive_IS)

                #Non sensitive subset of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(
                    freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(
                    freq_original, sensitive_IS)["itemsets"]

                #Calculation of metrics
                hiding_f = hiding_failure(freq_original_sensitive["itemsets"],
                                          freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(
                    set(freq_original["itemsets"]),
                    set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(),
                                       freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(
                    set(freq_original["itemsets"]),
                    set(freq_sanitized["itemsets"]),
                    set(freq_original_sensitive["itemsets"]))

                #Information loss between frequent itemsets in original and sanitized at sigma model
                information_l = information_loss(freq_model.copy(),
                                                 sanitized_DB)

                #Expected information loss if all sensitive frequent itemsets had their support reduced to sigma min
                expected_information_l = expected_information_loss(
                    freq_model.copy(), freq_original_sensitive.copy(),
                    sigma_min)

                #Calculate the end time of this iteration
                end_time = rps_time - total_time_start

                #Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- RPS time: {end_time}')

                #Plot support graphs
                dual_support_graph_distribution(
                    freq_model, sanitized_DB, sigma_model, dataset + "_" +
                    str(i) + "_" + str(sigma_min) + "_" + str(k_freq))

                #Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_RPS = count_FI_containing_S(
                    freq_sanitized, sensitive_IS)

                #Add to row of table
                new_row = {
                    'Model': dataset,
                    'Model threshold': sigma_model,
                    'Support threshold': sigma_min,
                    'Sensitive itemsets': k_freq,
                    'Number of FI before sanitization': len(freq_original),
                    'Number of FI containing an element of S before sanitization':
                    num_FI_containing_S,
                    'Information loss expected': expected_information_l,
                    'Number of FI after sanitization': len(freq_sanitized),
                    'Number of FI containing an element of S after RPS':
                    num_FI_containing_S_RPS,
                    'Hiding failure': hiding_f,
                    'Artifactual patterns': artifactual_p,
                    'Misses cost': misses_c,
                    'Side effects factor': side_effect_fac,
                    'Information loss': information_l,
                    'RPS Time': end_time
                }

                #Update after each one just so we are sure we are recording results
                table_11 = pd.concat([table_11, pd.DataFrame([new_row])],
                                     ignore_index=True)
                table_11.to_csv('table_11_' + str(i) + '.csv')
class TestArtifactualPatterns(unittest.TestCase):
    """Checks that RPS and PGBS sanitisation introduce no artifactual patterns."""

    original_IS = None          # all itemsets of the toy data (set in setUpClass)
    original_Closed_IS = None   # closed itemsets of the toy data (set in setUpClass)

    # Want to hide the sensitive itemsets below this threshold
    sigma_min = 0.3

    # Sensitive closed itemsets whose support needs to be reduced
    sensitive_IS = {frozenset(['1', '2']), frozenset(['4'])}

    # Get toy data, WARNING! Had to change relative reference for this to work
    basket_sets = im.import_dataset("toydata")

    @classmethod
    def setUpClass(cls):
        """Mine all itemsets and closed itemsets of the toy data once."""

        # Abuse FPGrowth using absolute smallest min support to get all itemsets as frequent itemsets
        sigma_model = 1 / len(cls.basket_sets)
        cls.original_IS = fpgrowth(cls.basket_sets,
                                   min_support=sigma_model,
                                   use_colnames=True,
                                   verbose=False)

        # Compute closed itemsets of original data base
        cls.original_Closed_IS, _ = get_closed_itemsets(
            cls.basket_sets, sigma_model)

        # Get frequent itemsets
        cls.original_Freq_IS = cls.original_IS[
            cls.original_IS["support"] >= cls.sigma_min]

    def test_artifactual_patterns_with_rps(self):
        """RPS sanitisation must not create itemsets absent from the original."""

        # Produce a sanitised DB with sensitive IS's support below sigma_min
        sanitized_closed_IS = rps(model=self.original_Closed_IS,
                                  sensitiveItemsets=self.sensitive_IS,
                                  supportThreshold=self.sigma_min)

        # Convert from closed to frequent itemsets
        sanitised_F_IS = itemsets_from_closed_itemsets(
            closed_itemsets=sanitized_closed_IS,
            possible_itemsets=self.original_IS['itemsets'])

        # All itemsets in original database
        a = set(self.original_Freq_IS["itemsets"])

        # All itemsets in sanitised database
        b = set(sanitised_F_IS[sanitised_F_IS["support"] >= self.sigma_min]
                ["itemsets"])

        af = artifactual_patterns(a, b)
        self.assertEqual(af, 0.0)

    def test_artifactual_patterns_with_pgbs(self):
        """PGBS sanitisation must not create itemsets absent from the original."""

        # PGBS needs input in this format; one threshold entry per sensitive
        # itemset (the original hard-coded two entries, breaking if the
        # sensitive set's size changed).
        sensitive_IL = pd.DataFrame({
            'itemset': [list(l) for l in self.sensitive_IS],
            'threshold': [self.sigma_min] * len(self.sensitive_IS)
        })

        original_database = self.basket_sets.copy()
        modified_database = self.basket_sets.copy()

        # No return value, instead it modifies input database in place
        pgbs(modified_database, sensitive_IL)

        # Get all itemsets and supports in D (original_database)
        sigma_model = 1 / len(original_database)
        original_IS = fpgrowth(original_database,
                               min_support=sigma_model,
                               use_colnames=True,
                               verbose=False)

        # Get all itemsets and supports in D' (modified_database)
        modified_F_IS = fpgrowth(modified_database,
                                 min_support=sigma_model,
                                 use_colnames=True,
                                 verbose=False)

        # All itemsets in original database
        a = set(original_IS["itemsets"])

        # All itemsets in sanitised database
        b = set(modified_F_IS["itemsets"])

        af = artifactual_patterns(a, b)
        self.assertEqual(af, 0.0)
def main():
    """Demonstrate closed-itemset recovery, RPS sanitisation and rule mining
    on the toy dataset."""
    min_support = 0.01  #Support threshold used
    min_confidence = 0.05  #Confidence threshold used

    print('========== Importing Dataset ==========')
    #Insert any of the datasets listed above here to import them
    basket_sets = im.import_dataset("toydata")
    print('=======================================\n')

    # The smallest possible support captures every itemset in the data.
    smallest_support = 1 / len(basket_sets)

    # Gather all itemsets
    power_set_of_items = fpgrowth(basket_sets,
                                  min_support=smallest_support,
                                  use_colnames=True)

    # Find frequent itemsets above support threshold min_support
    frequent_itemsets = fpgrowth(basket_sets,
                                 min_support=min_support,
                                 use_colnames=True)

    # Compute closed itemsets from database
    closed_itemsets, _ = get_closed_itemsets(basket_sets, smallest_support)

    # Recover the original itemsets from the list of closed itemsets
    recovered_itemsets = itemsets_from_closed_itemsets(
        closed_itemsets=closed_itemsets,
        possible_itemsets=power_set_of_items['itemsets'])

    # Round-trip through closed itemsets must be lossless.
    assert recovered_itemsets.equals(power_set_of_items)

    # Sanitize database
    # NOTE(review): other call sites in this project pass the model as
    # `model=`; this one uses `reference_model=` -- verify against the rps
    # signature actually imported here.
    sanitized_closed_itemsets = rps(
        reference_model=closed_itemsets,
        sensitiveItemsets={frozenset(['1', '2']),
                           frozenset(['4'])},
        supportThreshold=0.3)
    sanitized_database = itemsets_from_closed_itemsets(
        closed_itemsets=sanitized_closed_itemsets,
        possible_itemsets=power_set_of_items['itemsets'])

    print('Raw Database:')
    print(power_set_of_items)
    print()
    print('Sanitized Database:')
    print(sanitized_database)
    print()
    print(f'Frequent Itemsets above min_sup {min_support}:')
    print(frequent_itemsets)
    print()

    if frequent_itemsets.shape[0] == 0:
        print("Support too low, no frequent item sets found")
        return

    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=min_confidence)
    if rules.shape[0] > 0:
        print(rules[rules['confidence'] >= 0.0])
    else:
        print("Confidence too low, no rules were found")
Example #10
0
def main(datasets):
    """Benchmark PGBS sanitisation across datasets and support thresholds.

    For each dataset (mapping name -> [sigma_model, sigma_min, ...]) this
    runs PGBS on a copy of the transactions for every support threshold and
    sensitive-set size, and records hiding/quality metrics.

    Side effects: writes table_pgbs.csv incrementally, plus original.csv /
    sanitized.csv debug dumps and dual support-distribution plots.
    """
    #Create the base of a table
    table_11 = pd.DataFrame(columns=['Model',
                                     'Support threshold',
                                     'Model threshold',
                                     'Sensitive itemsets',
                                     'Number of FI before sanitization',
                                     'Information loss expected',
                                     'Number of FI after sanitization',
                                     'Number of FI containing an element of S after RPS',
                                     'Hiding failure',
                                     'Artifactual patterns',
                                     'Misses cost',
                                     'Side effects factor',
                                     'Information loss',
                                     'PGBS time'])

    #Loop through datasets
    for dataset in datasets:
        sigma_model = datasets[dataset][0]

        #Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  #This may be needed for some datasets
        print("\n", dataset, "imported\n")

        #Get frequent itemsets
        freq_model = fpgrowth(data, min_support=sigma_model, use_colnames=True)

        #Loop through support thresholds
        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            #Find original frequent itemsets at frequency sigma min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30, 50]:
                print("-", dataset, ":", k_freq, "Sensitive itemsets")

                #Copy the transactions so we can edit it directly
                #(PGBS modifies its input in place)
                copied_data = data.copy()

                #We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

                #Start timer for PGBS portion
                total_time_start = time.time()

                #Convert to pandas format for PGBS input
                sensitive_IS_pandas = pd.DataFrame(data=[(sensitive_IS), np.full((len(sensitive_IS)), sigma_min)]).T

                sensitive_IS_pandas.columns = ['itemset', 'threshold']

                #Run PGBS
                print("Running PGBS")
                pgbs(copied_data, sensitive_IS_pandas)
                print("PGBS run")
                pgbs_time = time.time()

                sensitive_IS = convert_to_sets(sensitive_IS)

                print("FPGrowth")
                #Reproduce frequent itemsets
                freq_model_sanitized = fpgrowth(copied_data, min_support=sigma_model, use_colnames=True)

                #Calculating metrics
                #Variables needed
                freq_sanitized = freq_model_sanitized.loc[freq_model_sanitized["support"] >= sigma_min]

                #Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(freq_original, sensitive_IS)

                #Non sensitive subset of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(freq_original, sensitive_IS)["itemsets"]

                #Calculation of metrics
                freq_original_sensitive.to_csv("original.csv")
                freq_sanitized_sensitive.to_csv("sanitized.csv")
                print("len:", len(freq_original_sensitive["itemsets"]), len(freq_sanitized_sensitive["itemsets"]))

                hiding_f = hiding_failure(freq_original_sensitive["itemsets"], freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(set(freq_original["itemsets"]), set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(), freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(set(freq_original["itemsets"]), set(freq_sanitized["itemsets"]), set(freq_original_sensitive["itemsets"]))

                #Information loss between frequent itemsets in original and sanitized at sigma model
                information_l = information_loss(freq_model.copy(), freq_model_sanitized)

                #Expected information loss if all sensitive frequent itemsets had their support reduced to sigma min
                expected_information_l = expected_information_loss(freq_model.copy(), freq_original_sensitive.copy(), sigma_min)

                #Calculate the end time of this iteration
                end_time = pgbs_time - total_time_start

                #Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- PGBS time: {end_time}')

                #Plot support graphs
                dual_support_graph_distribution(freq_model, freq_model_sanitized, sigma_model, dataset+"_PGBS_"+str(sigma_min)+"_"+str(k_freq))

                #Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_RPS = count_FI_containing_S(freq_sanitized, sensitive_IS)

                #Add to row of table
                new_row = {'Model': dataset,
                           'Model threshold': sigma_model,
                           'Support threshold': sigma_min,
                           'Sensitive itemsets': k_freq,
                           'Number of FI before sanitization': len(freq_original),
                           'Information loss expected': expected_information_l,
                           'Number of FI after sanitization': len(freq_sanitized),
                           'Number of FI containing an element of S after RPS': num_FI_containing_S_RPS,
                           'Hiding failure': hiding_f,
                           'Artifactual patterns': artifactual_p,
                           'Misses cost': misses_c,
                           'Side effects factor': side_effect_fac,
                           'Information loss': information_l,
                           'PGBS time': end_time}

                #Update after each one just so we are sure we are recording results
                #DataFrame.append was removed in pandas 2.0; concat instead.
                table_11 = pd.concat([table_11, pd.DataFrame([new_row])],
                                     ignore_index=True)
                table_11.to_csv('table_pgbs.csv')
Example #11
0
def main(datasets):
    """Benchmark SWA sanitisation across datasets and support thresholds.

    For each dataset (mapping name -> [sigma_model, sigma_min, ...]) this
    converts the data to transaction form, runs SWA for every support
    threshold and sensitive-set size, and records hiding/quality metrics.

    Side effects: writes table_SWA.csv incrementally, plus original.csv /
    sanitized.csv debug dumps and dual support-distribution plots.
    """
    #Create the base of a table
    table_11 = pd.DataFrame(columns=[
        'Model', 'Support threshold', 'Model threshold', 'Sensitive itemsets',
        'Number of FI before sanitization', 'Information loss expected',
        'Number of FI after sanitization',
        'Number of FI containing an element of S after SWA', 'Hiding failure',
        'Artifactual patterns', 'Misses cost', 'Side effects factor',
        'Information loss', 'SWA time'
    ])

    #Loop through datasets
    for dataset in datasets:
        #Loop through support thresholds #TODO: error running this in the normal way but
        #It is not much of a slowdown for SWA to have this here

        #Load dataset
        sigma_model = datasets[dataset][0]
        db = im.import_dataset(dataset)
        db = db.astype('bool')  #This may be needed for some datasets
        print("\n", dataset, "imported")

        #Get frequent itemsets
        freq_model = fpgrowth(db, min_support=sigma_model, use_colnames=True)

        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            #Find original frequent itemsets at frequency sigma min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30, 50]:

                #SWA mutates the transaction list, so rebuild it per run
                data = im.convert_to_transaction(db)

                print(dataset, ":", k_freq, "Sensitive itemsets")

                #We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(
                    freq_original, k_freq)

                #Start timer for SWA portion
                total_time_start = time.time()

                #Convert to pandas format for SWA input
                sensitive_rules = get_disclosures(sensitive_IS, freq_model,
                                                  sigma_min)

                #Run SWA
                SWA(data, sensitive_rules, data.shape[0])
                swa_time = time.time()

                sensitive_IS = convert_to_sets(sensitive_IS)

                data = im.convert_to_matrix(data)

                #Reproduce frequent itemsets
                freq_model_sanitized = fpgrowth(data,
                                                min_support=sigma_model,
                                                use_colnames=True)

                #Calculating metrics
                #Variables needed
                freq_sanitized = freq_model_sanitized.loc[
                    freq_model_sanitized["support"] >= sigma_min]

                #Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(
                    freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(
                    freq_original, sensitive_IS)

                #Non sensitive subset of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(
                    freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(
                    freq_original, sensitive_IS)["itemsets"]

                #Calculation of metrics
                freq_original_sensitive.to_csv("original.csv")
                freq_sanitized_sensitive.to_csv("sanitized.csv")
                print("- len:", len(freq_original_sensitive["itemsets"]),
                      len(freq_sanitized_sensitive["itemsets"]))

                hiding_f = hiding_failure(freq_original_sensitive["itemsets"],
                                          freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(
                    set(freq_original["itemsets"]),
                    set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(),
                                       freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(
                    set(freq_original["itemsets"]),
                    set(freq_sanitized["itemsets"]),
                    set(freq_original_sensitive["itemsets"]))

                #Information loss between frequent itemsets in original and sanitized at sigma model
                information_l = information_loss(freq_model.copy(),
                                                 freq_model_sanitized)

                #Expected information loss if all sensitive frequent itemsets had their support reduced to sigma min
                expected_information_l = expected_information_loss(
                    freq_model.copy(), freq_original_sensitive.copy(),
                    sigma_min)

                #Calculate the end time of this iteration
                end_time = swa_time - total_time_start

                #Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- SWA time: {end_time}')

                #Plot support graphs
                dual_support_graph_distribution(
                    freq_model, freq_model_sanitized, sigma_model,
                    dataset + "_SWA_" + str(sigma_min) + "_" + str(k_freq))

                #Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_RPS = count_FI_containing_S(
                    freq_sanitized, sensitive_IS)

                #Add to row of table
                new_row = {
                    'Model': dataset,
                    'Model threshold': sigma_model,
                    'Support threshold': sigma_min,
                    'Sensitive itemsets': k_freq,
                    'Number of FI before sanitization': len(freq_original),
                    'Information loss expected': expected_information_l,
                    'Number of FI after sanitization': len(freq_sanitized),
                    'Number of FI containing an element of S after SWA':
                    num_FI_containing_S_RPS,
                    'Hiding failure': hiding_f,
                    'Artifactual patterns': artifactual_p,
                    'Misses cost': misses_c,
                    'Side effects factor': side_effect_fac,
                    'Information loss': information_l,
                    'SWA time': end_time
                }

                #Update after each one just so we are sure we are recording results
                #DataFrame.append was removed in pandas 2.0; concat instead.
                table_11 = pd.concat([table_11, pd.DataFrame([new_row])],
                                     ignore_index=True)
                table_11.to_csv('table_SWA.csv')
def main(datasets):
    """Run the RPS two-threshold (MRPS) sanitization experiment per dataset.

    For every dataset name in ``datasets`` (a mapping of name ->
    ``(sigma_model, sigma_min)``), the top-``k_freq`` frequent itemsets are
    treated as sensitive and hidden with ``rps_two_thresholds`` using
    alternating upper (0.8/0.74) and lower (0.78/0.72) target supports.
    Support distributions before/after sanitization are plotted and the
    resulting information loss is printed.

    Args:
        datasets: dict mapping dataset name to a 2-sequence
            ``(sigma_model, sigma_min)`` of support thresholds.
    """
    for dataset in datasets:
        sigma_model = datasets[dataset][0]
        sigma_min = datasets[dataset][1]
        k_freq = 10  # number of sensitive itemsets to hide

        # Load dataset; boolean dtype is required for some datasets.
        data = im.import_dataset(dataset)
        data = data.astype('bool')
        print("\n", dataset, "imported\n")

        # Build the closed-itemset model, keep itemsets frequent at
        # sigma_min, and pick the top-k as the sensitive set.
        current_model, freq_model = get_closed_itemsets(data, sigma_model)
        freq_original = freq_model.loc[freq_model["support"] >= sigma_min]
        sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

        # MRPS input: each sensitive itemset gets an alternating pair of
        # upper (0.8/0.74) and lower (0.78/0.72) target support thresholds.
        upper_thresholds = np.tile([0.8, 0.74], k_freq // 2)
        lower_thresholds = np.tile([0.78, 0.72], k_freq // 2)
        sensitive_IS_pandas = pd.DataFrame(
            data=[(sensitive_IS), upper_thresholds, lower_thresholds]).T
        print(sensitive_IS_pandas)

        sensitive_IS_pandas.columns = [
            'itemset', 'upper_threshold', 'lower_threshold'
        ]

        # Run RPS with randomized per-itemset thresholds.
        sanitized_closed_IS = rps_two_thresholds(
            model=current_model, sensitiveItemsets=sensitive_IS_pandas)

        # Recover the full frequent-itemset table from the sanitized
        # closed itemsets.
        sanitized_DB = itemsets_from_closed_itemsets(
            closed_itemsets=sanitized_closed_IS,
            possible_itemsets=freq_model['itemsets'])

        # Plot original vs sanitized support distributions.
        dual_support_graph_distribution(
            freq_model, sanitized_DB, sigma_model,
            dataset + "_presentation_10_bins_" + str(k_freq))

        # Show sanitized vs original support for each sensitive itemset.
        for sensitive in sensitive_IS:
            print(
                sensitive, ":",
                sanitized_DB.loc[sanitized_DB['itemsets'] == sensitive]
                ["support"].values[0], ":", freq_model.loc[
                    freq_model['itemsets'] == sensitive]["support"].values[0])

        information_l = information_loss(freq_model.copy(), sanitized_DB)
        print(information_l)
def main(datasets, experiment):
    """Run one sanitization experiment per dataset and report information loss.

    Supported ``experiment`` values:
      * ``"MuRPS-range"`` -- MRPS with distinct upper/lower threshold ranges.
      * ``"MuRPS-set"``   -- MRPS with equal upper/lower target supports.
      * ``"SWA-set"``     -- SWA on the transaction form of the data.
      * ``"PGBS-set"``    -- PGBS directly on the boolean matrix.

    Args:
        datasets: dict mapping dataset name to a 2-sequence
            ``(sigma_model, sigma_min)`` of support thresholds.
        experiment: one of the experiment identifiers listed above.

    Raises:
        ValueError: if ``experiment`` is not a supported identifier.
    """
    supported = {"MuRPS-range", "MuRPS-set", "SWA-set", "PGBS-set"}
    if experiment not in supported:
        # Fail fast: an unknown experiment would otherwise surface much
        # later as a NameError on freq_model_sanitized.
        raise ValueError(f"Unknown experiment {experiment!r}; "
                         f"expected one of {sorted(supported)}")

    for dataset in datasets:
        sigma_model = datasets[dataset][0]
        sigma_min = datasets[dataset][1]
        k_freq = 10  # number of sensitive itemsets to hide

        # Load dataset; boolean dtype is required for some datasets.
        data = im.import_dataset(dataset)
        data = data.astype('bool')
        print("\n", dataset, "imported\n")

        # Build the closed-itemset model, keep itemsets frequent at
        # sigma_min, and pick the top-k as the sensitive set.
        current_model, freq_model = get_closed_itemsets(data, sigma_model)
        freq_original = freq_model.loc[freq_model["support"] >= sigma_min]
        sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

        # Per-itemset target supports shared by all "-set" experiments.
        thresholds = [
            0.7975, 0.7875, 0.7775, 0.7675, 0.7575, 0.7475, 0.7375, 0.7275,
            0.7175, 0.7075
        ]

        if experiment == "MuRPS-range":
            # MRPS input: distinct upper/lower target supports per itemset.
            sensitive_IS_pandas = pd.DataFrame(
                data=[(sensitive_IS),
                      np.array([
                          0.8, 0.79, 0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72,
                          0.71
                      ]),
                      np.array([
                          0.795, 0.785, 0.775, 0.765, 0.755, 0.745, 0.735,
                          0.725, 0.715, 0.705
                      ])]).T

        elif experiment == "MuRPS-set":
            # MRPS input: upper == lower, i.e. an exact target support.
            sensitive_IS_pandas = pd.DataFrame(data=[(sensitive_IS),
                                                     np.array(thresholds),
                                                     np.array(thresholds)]).T

        elif experiment == "SWA-set":
            db = im.convert_to_transaction(data)
            # SWA works on rules; attach the per-itemset disclosure
            # thresholds to the sensitive itemsets.
            sensitive_rules = get_disclosures(sensitive_IS, freq_model,
                                              thresholds)
            print(sensitive_rules)

            # SWA sanitizes the transaction database in place (window size
            # = whole database).
            SWA(db, sensitive_rules, db.shape[0])

            # Re-mine frequent itemsets from the sanitized database.
            data = im.convert_to_matrix(db)
            freq_model_sanitized = fpgrowth(data,
                                            min_support=sigma_model,
                                            use_colnames=True)

        elif experiment == "PGBS-set":
            # PGBS input: one target support threshold per itemset.
            sensitive_IS_pandas = pd.DataFrame(
                data=[(sensitive_IS), np.array(thresholds)]).T
            sensitive_IS_pandas.columns = ['itemset', 'threshold']

            # PGBS sanitizes the boolean matrix in place.
            pgbs(data, sensitive_IS_pandas)

            # Re-mine frequent itemsets from the sanitized matrix.
            freq_model_sanitized = fpgrowth(data,
                                            min_support=sigma_model,
                                            use_colnames=True)

        if experiment.startswith("M"):
            # Both MuRPS variants share the RPS two-threshold pipeline.
            sensitive_IS_pandas.columns = [
                'itemset', 'upper_threshold', 'lower_threshold'
            ]
            print(sensitive_IS_pandas)

            # Run RPS with randomized per-itemset thresholds.
            sanitized_closed_IS = rps_two_thresholds(
                model=current_model, sensitiveItemsets=sensitive_IS_pandas)

            # Recover the full frequent-itemset table from the sanitized
            # closed itemsets.
            freq_model_sanitized = itemsets_from_closed_itemsets(
                closed_itemsets=sanitized_closed_IS,
                possible_itemsets=freq_model['itemsets'])

        # Plot original vs sanitized support distributions.
        dual_support_graph_distribution(
            freq_model, freq_model_sanitized, sigma_model,
            dataset + "_presentation_" + experiment + "_" + str(k_freq))

        # Calculate and print information loss.
        information_l = information_loss(freq_model.copy(),
                                         freq_model_sanitized)
        print("Information loss:", information_l)