def test_predict_probability(self):
    """Smoke test: predict_probability on an m2-trained CBA runs without error."""
    classifier = CBA(algorithm="m2")
    df = pd.read_csv(dataset_file, sep=",")
    train_txns = TransactionDB.from_DataFrame(df)
    # the first two rows serve as a tiny prediction set
    holdout_txns = TransactionDB.from_DataFrame(df[:2])
    classifier.fit(train_txns)
    classifier.predict_probability(holdout_txns)
def test_predict_probability_works(self):
    """Each predicted probability must equal the confidence of the rule that matched.

    Replaces the `for idx in range(len(...))` index loop with `zip`, the
    idiomatic lockstep iteration over the two parallel result lists.
    """
    classifier = CBA(algorithm="m1")
    df = pd.read_csv(dataset_file, sep=",")
    train_txns = TransactionDB.from_DataFrame(df)
    holdout_txns = TransactionDB.from_DataFrame(df[:2])
    classifier.fit(train_txns)

    probabilities = classifier.predict_probability(holdout_txns)
    matched_rules = classifier.predict_matched_rules(holdout_txns)

    # a probability is defined as the confidence of the matched rule
    for probability, rule in zip(probabilities, matched_rules):
        self.assertEqual(probability, rule.confidence)
def test_accuracy(self):
    """The m2 classifier should reach the known accuracy on the two-row holdout."""
    expected_accuracy = 0.5
    classifier = CBA(algorithm="m2")
    df = pd.read_csv(dataset_file, sep=",")
    classifier.fit(TransactionDB.from_DataFrame(df))
    holdout_txns = TransactionDB.from_DataFrame(df[:2])
    accuracy = classifier.rule_model_accuracy(holdout_txns)
    self.assertAlmostEqual(accuracy, expected_accuracy, places=3)
def fit(self, rule_cutoff):
    """Evaluate the classifier over every prepared train/test split.

    For each split: mine top rules from the training transactions, keep the
    first `rule_cutoff` CARs, fit the wrapped classifier, and record its
    score (AUC when `self.score_auc` is set, plain score otherwise).

    Returns a list with one score per split.
    """
    scores = []
    for df_train, df_test in self._prepare_dataframes():
        txns_train = TransactionDB.from_DataFrame(df_train)
        mined = top_rules(txns_train.string_representation,
                          appearance=txns_train.appeardict)
        cars = createCARs(mined)[:rule_cutoff]

        quant_train = QuantitativeDataFrame(df_train)
        quant_test = QuantitativeDataFrame(df_test)

        self.classifier.fit(quant_train, cars, debug=self.debug)

        # pick the scoring function once, then apply it to the test split
        scorer = self.classifier.score_auc if self.score_auc else self.classifier.score
        scores.append(scorer(quant_test))

    return scores
def fit(self, quant_dataframe, cars=None, rule_cutoff=30, lambda_array=None, class_name=None, debug=False, algorithm="SLS"):
    """Train one sub-classifier per class value (one-vs-rest style).

    Args:
        quant_dataframe: QuantitativeDataFrame holding the full training data.
        cars: unused -- rules are re-mined per class below; the parameter is
            kept for signature compatibility with existing callers.
        rule_cutoff: number of top-sorted CARs kept for each class.
        lambda_array: IDS lambda weights; ``None`` means seven equal weights
            (the previous default of ``7 * [1]``).
        class_name: optional class-column name forwarded to ``_prepare``.
        debug: forwarded to each sub-classifier's ``fit``.
        algorithm: optimization algorithm name forwarded to ``fit``.

    Fix: the original signature used the mutable default ``lambda_array=7 * [1]``
    (a shared list across calls); it is replaced with the None-sentinel idiom.
    The loop also no longer shadows the ``quant_dataframe`` parameter.
    """
    # None-sentinel instead of a mutable default argument
    if lambda_array is None:
        lambda_array = 7 * [1]

    self.quant_dataframe_train = quant_dataframe
    self._prepare(quant_dataframe, class_name)

    for class_, clf_dict in self.ids_classifiers.items():
        print("training class:", class_)

        clf = clf_dict["clf"]
        # per-class dataframe; distinct name so the method argument is not shadowed
        class_quant_df = clf_dict["quant_dataframe"]
        pandas_dataframe = class_quant_df.dataframe

        txns = TransactionDB.from_DataFrame(pandas_dataframe)
        rules = top_rules(txns.string_representation, appearance=txns.appeardict)

        mined_cars = createCARs(rules)
        mined_cars.sort(reverse=True)  # best rules first before applying the cutoff

        clf.fit(class_quant_df,
                mined_cars[:rule_cutoff],
                lambda_array=lambda_array,
                debug=debug,
                algorithm=algorithm)
def mine_frequent_itemsets(self, pandas_df, minsup):
    """Mine frequent itemsets from every column except the class (last) column.

    `minsup` is a fraction in [0, 1]; fim.apriori takes a percentage, hence
    the * 100 conversion. report="s" asks for supports in the result.
    """
    # drop the class column so only attribute items are mined
    attribute_df = pandas_df.iloc[:, :-1]
    txns_classless = TransactionDB.from_DataFrame(attribute_df)
    return fim.apriori(txns_classless.string_representation,
                       supp=minsup * 100,
                       report="s")
def test_fitting(self):
    """Smoke test: CBA.fit completes on the sample dataset."""
    classifier = CBA()
    df = pd.read_csv(dataset_file, sep=",")
    classifier.fit(TransactionDB.from_DataFrame(df))
def mine_CARs(df, rule_cutoff, sample=False):
    """Mine class association rules (CARs) from a pandas DataFrame.

    Args:
        df: pandas DataFrame to convert into transactions.
        rule_cutoff: maximum number of CARs to return.
        sample: when True, draw a random sample of the mined CARs instead
            of taking the first `rule_cutoff`.

    Returns:
        A list of at most `rule_cutoff` CARs.

    Fix: `random.sample(cars, rule_cutoff)` raised ValueError whenever fewer
    than `rule_cutoff` rules were mined; the sample size is now capped at
    len(cars), matching the forgiving semantics of the slice branch.
    """
    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation, appearance=txns.appeardict)
    cars = createCARs(rules)

    if sample:
        # cap the sample size so a short rule list cannot raise ValueError
        return random.sample(cars, min(rule_cutoff, len(cars)))
    return cars[:rule_cutoff]
def test_target_class_works(self):
    """An explicitly chosen target column must appear as the rule consequent."""
    classifier = CBA(algorithm="m2")
    df = pd.read_csv(dataset_file, sep=",")
    classifier.fit(TransactionDB.from_DataFrame(df, target="Gender"))
    first_rule = classifier.clf.rules[0]
    self.assertEqual(first_rule.consequent[0], "Gender")
def test_rule_class_label_works(self):
    """Without an explicit target, the last column becomes the rule consequent."""
    classifier = CBA(algorithm="m2")
    df = pd.read_csv(dataset_file, sep=",")
    classifier.fit(TransactionDB.from_DataFrame(df))
    first_rule = classifier.clf.rules[0]
    self.assertEqual(first_rule.consequent[0], df.columns.values[-1])
def test_inspect(self):
    """inspect() returns a DataFrame: one row per rule plus the default rule."""
    classifier = CBA()
    df = pd.read_csv(dataset_file, sep=";")
    classifier.fit(TransactionDB.from_DataFrame(df))

    clf = classifier.clf
    inspected = clf.inspect()

    self.assertEqual(type(inspected), pd.DataFrame)
    # the extra row is the default rule, whose antecedent is the empty set
    self.assertEqual(len(inspected), len(clf.rules) + 1)
    self.assertEqual(inspected["lhs"].iloc[-1], "{}")
def mine_CARs(df, rule_cutoff, sample=False, random_seed=None, **top_rules_kwargs):
    """Mine class association rules (CARs) from a pandas DataFrame.

    Args:
        df: pandas DataFrame to convert into transactions.
        rule_cutoff: maximum number of CARs to return.
        sample: when True, draw a random sample of the mined CARs instead
            of taking the first `rule_cutoff`.
        random_seed: optional seed applied to both `random` and `np.random`
            for reproducible sampling/mining.
        **top_rules_kwargs: forwarded verbatim to `top_rules`.

    Returns:
        A list of at most `rule_cutoff` CARs.

    Fixes: (1) `if random_seed:` silently ignored a legitimate seed of 0 —
    now tested with `is not None`; (2) `random.sample` raised ValueError when
    fewer than `rule_cutoff` rules were mined — the sample size is capped.
    """
    if random_seed is not None:  # honor seed 0, which is falsy
        random.seed(random_seed)
        np.random.seed(random_seed)

    txns = TransactionDB.from_DataFrame(df)
    rules = top_rules(txns.string_representation,
                      appearance=txns.appeardict,
                      **top_rules_kwargs)
    cars = createCARs(rules)

    if sample:
        # cap the sample size so a short rule list cannot raise ValueError
        return random.sample(cars, min(rule_cutoff, len(cars)))
    return cars[:rule_cutoff]
# Benchmark driver: for each dataset, load its train/test file pairs and
# define an objective `fmax` over (support, confidence) for CBA.
# NOTE(review): this chunk appears truncated — `fmax` ends at cba.fit(txns)
# with no return/score visible, and `benchmark_data` is never appended to here.
benchmark_data = []

for dataset_name, rule_cutoff in dataset_rulenum.items():
    print(dataset_name)

    train_files = get_dataset_files(train_path, dataset_name)
    test_files = get_dataset_files(test_path, dataset_name)

    # `[:]` copies the zipped list; presumably a leftover knob for slicing
    # a subset of folds during experimentation — TODO confirm
    for train_file, test_file in list(zip(train_files, test_files))[:]:
        dataset_path = os.path.join(train_path, train_file)
        dataset_test_path = os.path.join(test_path, test_file)

        df = pd.read_csv(dataset_path)
        quant_df = QuantitativeDataFrame(df)
        txns = TransactionDB.from_DataFrame(df)

        df_test = pd.read_csv(dataset_test_path)
        quant_df_test = QuantitativeDataFrame(df_test)
        txns_test = TransactionDB.from_DataFrame(df_test)

        # Objective for a parameter search: params arrive as integers in
        # [0, 1000] and are rescaled to fractions before fitting CBA.
        # Closes over `txns` from the enclosing loop iteration.
        def fmax(param_dict):
            print(param_dict)
            support, confidence = param_dict["support"] / 1000, param_dict[
                "confidence"] / 1000
            print(dict(support=support, confidence=confidence))
            cba = CBA(support=support, confidence=confidence)
            cba.fit(txns)
import numpy as np
from pyarc.qcba.data_structures import QuantitativeDataFrame
import time
from pyids.algorithms.ids_classifier import mine_CARs
from pyids.algorithms.ids import IDS
from pyarc.data_structures import TransactionDB
from pyarc.algorithms import M1Algorithm

#logging.basicConfig(level=logging.DEBUG)

# NOTE(review): `pd` is used below but `import pandas as pd` is not visible
# in this chunk — confirm it is imported elsewhere in the file.
iris_file = "c:/code/python/machine_learning/assoc_rules/train/iris0.csv"

df = pd.read_csv(iris_file)
txns = TransactionDB.from_DataFrame(df)

# Timing benchmark over increasing rule counts on the iris dataset.
# NOTE(review): chunk appears truncated — the inner algorithm loop ends
# right after `times = []`, and `rules` / `iris_benchmark` are unused here.
iris_benchmark = []

for i in range(10, 110, 10):
    rule_count = i

    rules = mine_CARs(df, rule_count)
    quant_df = QuantitativeDataFrame(df)
    cars = mine_CARs(df, rule_count)  # mined a second time; presumably redundant — TODO confirm
    print(len(cars))

    for algorithm in ["DLS", "SLS"]:
        times = []
# Benchmark driver: train CBA per dataset, then wrap its rules in an IDS
# classifier for comparison.
# NOTE(review): this chunk appears truncated — `benchmark_list`, `txns_test`
# and `quant_df_*` are set up but never used in the visible region.
dataset_files = [f"{dataset_name}0.csv" for dataset_name in datasets]

dataset_path = "C:/code/python/machine_learning/assoc_rules/"
dataset_path_train = os.path.join(dataset_path, "train")
# NOTE(review): "../../../test" escapes dataset_path entirely — verify this
# is the intended test-data location.
dataset_path_test = os.path.join(dataset_path, "../../../test")

benchmark_list = []

for dataset_filename in dataset_files:
    print(dataset_filename)

    df_train = pd.read_csv(os.path.join(dataset_path_train, dataset_filename))
    df_test = pd.read_csv(os.path.join(dataset_path_test, dataset_filename))

    txns_train = TransactionDB.from_DataFrame(df_train)
    txns_test = TransactionDB.from_DataFrame(df_test)

    quant_df_train = QuantitativeDataFrame(df_train)
    quant_df_test = QuantitativeDataFrame(df_test)

    cba = CBA(support=0.1, confidence=0.1)
    cba.fit(txns_train)

    # Reuse the CBA-mined rules (and its default class) as an IDS model
    rules = cba.clf.rules
    ids_ruleset = IDSRuleSet.from_cba_rules(rules)

    ids = IDS()
    ids.clf = IDSClassifier(ids_ruleset.ruleset)
    ids.clf.default_class = cba.clf.default_class