from copy import deepcopy
from functools import partial
from subprocess import call
from time import time
from typing import Any, Dict, List, Tuple
import os
import shutil

import numpy as np
import pandas as pd
import Orange

# Project-level helpers (Item, indexes2bitset, activation_numeric,
# activation_nominal, read_csvfile, write_file_dssd, writeCSVwithHeader,
# transform_dataset, decision_pattern, find_top_k_subgroups_general_precall)
# are assumed to be importable from the surrounding package.


def create_item(indexes, variable_name, min_val, max_val, description,
                number_operations):
    """ Creates an Item instance from the values of a NumericAttribute.

    Parameters
    ----------
    indexes : np.ndarray
        Array of indexes where the item is present in the training data.
    variable_name : str
        Name of the attribute/variable that this item is attached to.
    min_val : float
        Minimum value covered by this item: item > min_val.
    max_val : float
        Maximum value covered by this item: item < max_val.
    description : str
        Text describing the interval defined by the item.
    number_operations : int
        Number of logical operators used to define the interval, e.g.,
        "item < max_val" uses 1 operator; "min_val < item < max_val" uses 2.

    Returns
    ----------
    Item : Item class object
        Item with the characteristics described by the arguments.
    """
    bit_array = indexes2bitset(indexes)
    activation_function = partial(activation_numeric,
                                  attribute_name=variable_name,
                                  minval=min_val,
                                  maxval=max_val)
    return Item(bit_array, variable_name, description, number_operations,
                activation_function)
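# `indexes2bitset` is a project helper used throughout this module; the
# docstrings below mention gmpy2.mpz, so a minimal sketch of the assumed
# behaviour (illustrative re-implementation, not the project's own): pack
# integer row indexes into a gmpy2 mpz bitset so that support intersections
# become fast bitwise ANDs.
import gmpy2
from gmpy2 import mpz


def indexes2bitset_sketch(indexes):
    bits = mpz(0)
    for i in indexes:
        bits = gmpy2.bit_set(bits, int(i))  # set one bit per covered row
    return bits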
def run_FSSD_wrapper(dataset, attributes, class_attribute, types, depthmax):
    offset = 0
    nb_attributes = len(attributes)
    timebudget = 3600
    top_k = 1000
    wanted_label = dataset[0]["class"]
    attributes = attributes[offset:offset + nb_attributes]
    types = types[offset:offset + nb_attributes]
    timespent = time()
    pattern_setoutput, pattern_union_info, top_k_returned, header_returned = \
        find_top_k_subgroups_general_precall(dataset, attributes, types,
                                             class_attribute, wanted_label,
                                             top_k, 'fssd', False,
                                             timebudget, depthmax)
    timespent = time() - timespent
    # print(top_k_returned[-1])

    # Value range of each attribute: [min, max] for numeric attributes and
    # the set of observed categories for nominal/simple ones.
    range_attributes = []
    for ia, a in enumerate(attributes):
        colvals = [row[a] for row in dataset]
        if types[ia] == "numeric":
            range_attributes.append([min(colvals), max(colvals)])
        elif types[ia] in ("nominal", "simple"):
            range_attributes.append(list(set(colvals)))

    # Class value counts over the whole dataset.
    c_values = list(set(row["class"] for row in dataset))
    count_cl = [0 for c in c_values]
    for row in dataset:
        for ic, c in enumerate(c_values):
            if row["class"] == c:
                count_cl[ic] += 1

    subgroup_sets = []
    rules_supp = []
    nitems = []
    for pat in pattern_setoutput:
        # An attribute contributes an item only if the pattern restricts it
        # to less than its full range.
        nitemsaux = 0
        for ia, a in enumerate(attributes):
            # print("pattern: " + str(set(pat[0][ia])) + " range: " + str(set(range_attributes[ia])))
            if not set(pat[0][ia]) >= set(range_attributes[ia]):
                nitemsaux += 1
        nitems.append(nitemsaux)
        # per-class support of the subgroup
        subgroup_index = pat[1]["support_full"]
        aux_supp = [0 for c in c_values]
        for idx in subgroup_index:
            for ic, c in enumerate(c_values):
                if dataset[idx]["class"] == c:
                    aux_supp[ic] += 1
        rules_supp.append(aux_supp)
        subgroup_sets.append(indexes2bitset(subgroup_index))
    return nitems, subgroup_sets, timespent
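# Illustrative call (requires the original FSSD implementation providing
# `find_top_k_subgroups_general_precall` on the path). The toy dataset below
# only documents the input format this wrapper expects: a list of dicts
# keyed by attribute name, with the label under "class".
#
#     toy = [{"age": 25, "sex": "m", "class": "pos"},
#            {"age": 40, "sex": "f", "class": "neg"},
#            {"age": 31, "sex": "f", "class": "pos"}]
#     nitems, subgroup_sets, timespent = run_FSSD_wrapper(
#         toy, attributes=["age", "sex"], class_attribute="class",
#         types=["numeric", "nominal"], depthmax=3)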
def create_items(self) -> Tuple[List[Item], Dict[int, int]]:
    """ Creates a list of items from the nominal attribute.

    Makes a list of items using an equality relationship with the
    categories. Example: "eye_colour = blue" could be the description of
    one of the items for NominalAttribute.name = "eye_colour".

    Returns
    ----------
    items : List[Item]
        A list of all items based on the possible categories (only with
        equality relationships, not logical ORs).
    cardinality_operator : Dict[int, int]
        Mapping from the number of operators (always 1 here) to the number
        of items using that many operators.
    """
    self.cardinality_operator = {1: len(self.categories)}
    number_operators = 1
    for category in self.categories:
        vector_category = np.where(self.values == category)[0]
        bit_array = indexes2bitset(vector_category)
        description = self.name + " = " + str(category)
        activation_function = partial(activation_nominal,
                                      attribute_name=self.name,
                                      category=category)
        self.items.append(Item(bit_array, self.name, description,
                               number_operators, activation_function))
    return self.items, self.cardinality_operator
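# The activation functions bound via `partial` above are project helpers; a
# minimal sketch of their assumed semantics (illustrative names and
# signatures, not the project's own implementation):
def activation_nominal_sketch(instance, attribute_name, category):
    # True when the instance takes exactly this category.
    return instance[attribute_name] == category


def activation_numeric_sketch(instance, attribute_name, minval, maxval):
    # True when the instance value falls inside the open interval.
    return minval < instance[attribute_name] < maxval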
def findbitsets(patterns4prediction, X, Y):
    indices_subgroups = [[] for pattern in patterns4prediction]
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(Y, pd.DataFrame):
        Y = Y.values
    # find the row indices covered by each pattern
    for ix, x in enumerate(X):
        for nr in range(len(patterns4prediction)):
            if decision_pattern(patterns4prediction[nr], x):
                indices_subgroups[nr].append(ix)
    # drop patterns that cover no rows
    indices_subgroups = [indices for indices in indices_subgroups if indices]
    # convert index lists to bitsets
    bitsets_subgroups = [
        indexes2bitset(indices) for indices in indices_subgroups
    ]
    return bitsets_subgroups
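# `decision_pattern` is assumed to test whether a single instance satisfies
# every condition of a pattern. An illustrative sketch, under the assumption
# that a pattern is a list of (column_index, operator, value) triples; the
# actual pattern representation may differ:
import operator


def decision_pattern_sketch(pattern, x):
    ops = {"<": operator.lt, ">": operator.gt, "=": operator.eq}
    return all(ops[op](x[col], value) for col, op, value in pattern)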
def init_bitarrays_class(
        self, target_values
) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
    """ Initializes the bit array values for each category of each target.

    Returns
    ----------
    bit_arrays_var_class : Dict[str, Dict[Any, gmpy2.mpz]]
        Per target column, the bitarray of the rows taking each category.
    counts : Dict[str, Dict[Any, int]]
        Per target column, the number of rows taking each category.
    prob_var_class : Dict[str, Dict[Any, float]]
        Per target column, the empirical probability of each category.
    """
    for namecol, colvals in target_values.items():
        self.bit_arrays_var_class[namecol] = dict()
        self.counts[namecol] = dict()
        self.prob_var_class[namecol] = dict()
        for icat, category in enumerate(self.categories[namecol]):
            category_indexes = np.where(colvals.values == category)[0]
            self.bit_arrays_var_class[namecol][category] = \
                indexes2bitset(category_indexes)
            self.counts[namecol][category] = len(category_indexes)
            self.prob_var_class[namecol][category] = \
                self.counts[namecol][category] / target_values.shape[0]
    return self.bit_arrays_var_class, self.counts, self.prob_var_class
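# Illustrative: for a single target column "y" with values
# ["a", "b", "a", "a"], the structures built above would be
#     counts                         -> {"y": {"a": 3, "b": 1}}
#     prob_var_class                 -> {"y": {"a": 0.75, "b": 0.25}}
#     bit_arrays_var_class["y"]["a"] -> bitset with bits 0, 2, 3 set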
def run_DSSD_wrapper(algorithmname, beam_width, number_rules_SSD, datasetname,
                     df, task, depthmax, attribute_names, number_targets):
    if algorithmname == "seq-cover":
        conf_file = read_csvfile(
            './otheralgorithms/DSSD/bin/tmp_sequential.conf')
    elif algorithmname == "DSSD":
        conf_file = read_csvfile(
            './otheralgorithms/DSSD/bin/tmp_dssd_diverse.conf')
        conf_file[12] = [
            'postSelect = ' +
            str(int(number_rules_SSD.loc[datasetname, "number_rules"]))
        ]
    elif algorithmname == "top-k":
        conf_file = read_csvfile('./otheralgorithms/DSSD/bin/tmp_topk.conf')
        conf_file[12] = [
            'postSelect = ' +
            str(int(number_rules_SSD.loc[datasetname, "number_rules"]))
        ]
        nrows = df.shape[0]
        # exhaustive search is only affordable on small single-nominal tasks
        if nrows < 2000 and task == "single-nominal":
            conf_file[14] = ['searchType = dfs']
        else:
            conf_file[14] = ['searchType = beam']
    else:
        raise Exception("Wrong algorithm name")
    conf_file[19] = ['beamWidth = ' + str(int(beam_width))]
    conf_file[15] = ['maxDepth = ' + str(min(int(depthmax), 10))]
    if task == "multi-nominal" or task == "single-nominal":
        conf_file[23] = ['measure = WKL']
        # conf_file[24] = ['WRAccMode = 1vsAll']
    elif task == "multi-numeric" or task == "single-numeric":
        conf_file[23] = ['measure = meantest']
        conf_file[24] = ['WRAccMode = 1vsAll']
    else:
        raise Exception("Wrong task name")
    write_file_dssd(conf_file, './otheralgorithms/DSSD/bin/tmp.conf')

    # recreate the experiment output folder from scratch
    if not os.path.exists('./otheralgorithms/DSSD/xps/dssd'):
        os.makedirs('./otheralgorithms/DSSD/xps/dssd')
    else:
        shutil.rmtree('./otheralgorithms/DSSD/xps/dssd')
        os.makedirs('./otheralgorithms/DSSD/xps/dssd')

    # change target variable file - target variables are at the end!
    name_targets = attribute_names[-number_targets:]
    targets_file = pd.read_csv(
        './otheralgorithms/DSSD/data/datasets/tmp/emmModel.emm',
        delimiter="=", header=None)
    targets_file.iloc[1, 1] = ' ' + ','.join(name_targets)
    targets_file.to_csv('./otheralgorithms/DSSD/data/datasets/tmp/tmp.emm',
                        index=False, sep="=", header=False)

    # run DSSD
    timespent = time()
    os.chdir("./otheralgorithms/DSSD/bin")
    call(["emc64-mt-modified.exe"])
    # call(["dssd64.exe"])
    os.chdir("../../../")
    timespent = time() - timespent
    os.remove("./otheralgorithms/DSSD/data/datasets/tmp/tmp.arff")

    # read output files: the newest experiment folder is the one just created
    auxfiles = [
        path for path in os.listdir('./otheralgorithms/DSSD/xps/dssd/')
    ]
    generated_xp = './otheralgorithms/DSSD/xps/dssd/' + auxfiles[-1]
    timestamp = generated_xp.split('-')[1]

    # find transaction ids of subgroups
    generated_xp_subsets_path = generated_xp + '/subsets'
    all_generated_subgroups_files = [
        generated_xp_subsets_path + '/' + x
        for x in os.listdir(generated_xp_subsets_path)
    ]

    # find descriptions of subgroups
    if algorithmname == "top-k":
        description_files = generated_xp + '/' + "stats1-" + timestamp + ".csv"
    elif algorithmname == "seq-cover":
        description_files = generated_xp + '/' + "stats2-" + timestamp + ".csv"
    elif algorithmname == "DSSD":
        description_files = generated_xp + '/' + "stats3-" + timestamp + ".csv"

    # count number of items per subgroup (conditions are joined with "&&")
    descriptions = read_csvfile(description_files)
    # columnames, typevar, limits = info4prediction(df.iloc[:, :-number_targets], number_targets)
    # patterns4prediction = make_patterns4prediction(descriptions, columnames, typevar, limits)
    # Test dataset
    # nrows_test = Y_test.shape[0]
    # bitsets_subgroups = findbitsets(patterns4prediction, X_test, Y_test)
    nitems = []
    for row in descriptions[1:]:
        nitems.append(1 + row[0].count("&&"))

    # recover the support (covered rows) of each subgroup from the bit
    # vectors that DSSD writes to the subsets folder
    subgroup_sets_support = []
    subgroup_sets_support_bitset = []
    support_union = set()
    nb_subgroups = 0
    rules_supp = []
    for subgroup_file in all_generated_subgroups_files:
        aux_subgroup = read_csvfile(subgroup_file)[2:]
        subgroup_biset = [row[0] for row in aux_subgroup]
        subgroup_index = set(
            i for i, x in enumerate(subgroup_biset) if x == '1')
        subgroup_sets_support.append(subgroup_index)
        subgroup_sets_support_bitset.append(indexes2bitset(subgroup_index))
        rules_supp.append(len(subgroup_index))
        nb_subgroups += 1
    return nitems, subgroup_sets_support_bitset, timespent
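# `read_csvfile` and `write_file_dssd` are project helpers for the DSSD
# .conf and output files; a minimal sketch of the assumed behaviour (rows as
# lists of strings, one entry per line), illustrative only:
import csv


def read_csvfile_sketch(path):
    with open(path, newline='') as f:
        return [row for row in csv.reader(f)]


def write_file_dssd_sketch(rows, path):
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)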
def run_CN2SD_wrapper(dataset, attributes, types, class_attribute, beam_width,
                      depthmax, quality):
    wanted_label = dataset[0]["class"]
    # dataset, header = readCSVwithHeader(file, numberHeader=[a for a, t in zip(attributes, types) if t == 'numeric'], delimiter=delimiter)
    new_dataset = deepcopy(dataset)
    new_dataset, positive_extent, negative_extent, alpha_ratio_class, _ = \
        transform_dataset(dataset, attributes, class_attribute, wanted_label)
    # Orange's tab format expects two header rows: variable types
    # ('c'ontinuous / 'd'iscrete) and the class-role marker.
    new_dataset.insert(
        0, {
            a: 'c' if t == 'numeric' else 'd'
            for a, t in list(zip(attributes, types)) + [('class', 'class')]
        })
    new_dataset.insert(
        1, {a: '' if a != 'class' else 'class' for a in attributes + ['class']})
    writeCSVwithHeader(new_dataset,
                       './otheralgorithms/tmpForOrange.csv',
                       selectedHeader=attributes + ['class'],
                       delimiter='\t',
                       flagWriteHeader=True)
    data = Orange.data.Table('./otheralgorithms/tmpForOrange.csv')
    # print(data)
    timespent = time()
    # unordered rules; use CN2SDLearner for the ordered variant
    learner = Orange.classification.rules.CN2SDUnorderedLearner()
    if quality == 'entropy':
        learner.rule_finder.quality_evaluator = \
            Orange.classification.rules.EntropyEvaluator()
    elif quality == 'wracc':
        learner.rule_finder.quality_evaluator = \
            Orange.classification.rules.WeightedRelativeAccuracyEvaluator()
    # learner = Orange.classification.rules.CN2SDLearner()
    learner.gamma = 0.
    # learner.evaluator = "Evaluator_Entropy"
    learner.rule_finder.search_algorithm.beam_width = beam_width
    # continuous value space is constrained to reduce computation time
    learner.rule_finder.search_strategy.constrain_continuous = True
    # found rules must cover at least 15 examples
    learner.rule_finder.general_validator.min_covered_examples = 15
    # learner.rule_finder.general_validator.min_covered_examples = max(int(float(len(positive_extent)) / 10), 1.)
    # found rules may combine at most depthmax selectors (conditions)
    learner.rule_finder.general_validator.max_rule_length = depthmax
    classifier = learner(data)
    timespent = time() - timespent
    # drop the default rule appended at the end of the rule list
    del classifier.rule_list[-1]

    subgroup_sets = []
    rules_supp = []
    nitems = []
    for rule in classifier.rule_list:
        s = str(rule)
        nitems.append(1 + s.count("AND"))
        subgroup_biset = rule.covered_examples
        subgroup_index = set(ix for ix, x in enumerate(subgroup_biset) if x)
        subgroup_sets.append(indexes2bitset(subgroup_index))
        rules_supp.append(rule.curr_class_dist.tolist())
    return nitems, subgroup_sets, timespent
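# Illustrative call (requires the Orange3 package and the project helpers
# above); dataset rows are dicts keyed by attribute name plus "class". The
# toy values only document the expected input format:
#
#     toy = [{"age": 25, "sex": "m", "class": "pos"},
#            {"age": 40, "sex": "f", "class": "neg"},
#            {"age": 31, "sex": "f", "class": "pos"}]
#     nitems, subgroup_sets, timespent = run_CN2SD_wrapper(
#         toy, ["age", "sex"], ["numeric", "nominal"], "class",
#         beam_width=10, depthmax=3, quality="wracc")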