from copy import deepcopy  # required by select() below


def train(self, sample_set):
    """Score every feature against each confounder and emit one
    association rule per feature, tagged with its mi/cmi/cwmi scores."""
    cwmilibrary = CWMILibrary()
    cwmilibrary.confounders = self.confounders
    ardic = {}
    nattributes = sample_set.get_number_of_features()
    # NOTE: the full taxonomic list is immediately overridden below;
    # only "order" is currently used as a confounder.
    confounders = ["genus", "family", "order", "class", "phylum",
                   "superkingdom"]
    confounders = ["order"]
    cwmilibrary.P_Y = None
    for confounder in confounders:
        # Reset the cached conditional distributions for each confounder.
        cwmilibrary.P_Y_Z = None
        cwmilibrary.P_Z = None
        for i in xrange(nattributes):
            if i % 100 == 0:
                print "Processed %d of %d" % (i, nattributes)
            feature = i
            feature_scores = {}
            scores = cwmilibrary._calculate_information_scores(
                sample_set, feature, confounder)
            feature_scores["%s_mi" % confounder] = scores["mi"]
            feature_scores["%s_cmi" % confounder] = scores["cmi"]
            # CWMI: the product of conditional and marginal mutual
            # information, normalized by the conditional entropy H(Y|Z).
            cwmi = 0.0
            if scores["hygivenz"] > 0:
                cwmi = float(scores["cmi"] * scores["mi"]) / scores["hygivenz"]
            feature_scores["%s_cwmi" % confounder] = cwmi
            if feature not in ardic:
                ardic[feature] = {}
            ardic[feature].update(feature_scores)
    # Convert the per-feature score dictionaries into association rules,
    # each pairing a feature with its best-matching class label.
    association_rule_set = AssociationRuleSet()
    arlist = []
    for key in ardic.keys():
        class_label = self.get_best_class([key], sample_set)
        arlist.append(AssociationRule([key], [class_label], ardic[key]))
    association_rule_set.extend(arlist)
    association_rule_set = association_rule_set.remap_index_to_feature(
        sample_set)
    return association_rule_set
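# A minimal usage sketch for train(). The class name CWMITrainer, the
# confounder_table value, and the sample_set loader are assumptions for
# illustration; only train() itself comes from this module:
#
#   trainer = CWMITrainer()                       # hypothetical host class
#   trainer.confounders = confounder_table        # mapping consumed by CWMILibrary
#   rules = trainer.train(sample_set)             # one AssociationRule per feature
#   for rule in rules:
#       print rule.ls, rule.rs                    # feature and class label of each rule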
def select(self, sample_set):
    """Rank features by mi/cmi/cwmi under a single confounder, then pick
    features_per_class features per class for each score, round-robin."""
    cwmilibrary = CWMILibrary()
    cwmilibrary.confounders = self.confounders
    ardic = {}
    nattributes = sample_set.get_number_of_features()
    confounder = self.confounder
    cwmilibrary.P_Y = None
    cwmilibrary.P_Y_Z = None
    cwmilibrary.P_Z = None
    for i in xrange(nattributes):
        #if i % 100 == 0:
        #    print "Processed %d of %d" % (i, nattributes)
        feature = i
        feature_scores = {}
        scores = cwmilibrary._calculate_information_scores(
            sample_set, feature, confounder)
        feature_scores["mi"] = scores["mi"]
        feature_scores["cmi"] = scores["cmi"]
        cwmi = 0.0
        if scores["hygivenz"] > 0:
            cwmi = float(scores["cmi"] * scores["mi"]) / scores["hygivenz"]
        feature_scores["cwmi"] = cwmi
        if feature not in ardic:
            ardic[feature] = {}
        ardic[feature].update(feature_scores)
    association_rule_set = AssociationRuleSet()
    arlist = []
    for key in ardic.keys():
        class_labels = self.get_best_classes([key], sample_set)
        class_label = "NULL"
        if len(class_labels) > 0:
            class_label = class_labels[0]
        arlist.append(AssociationRule([key], [class_label], ardic[key]))
    association_rule_set.extend(arlist)
    class_labels = self._get_class_labels(sample_set)
    association_rule_set = association_rule_set.remap_index_to_feature(
        sample_set)
    # Make one copy of the rule set per score, each sorted by that score,
    # then switch back and forth between them until the quota is filled.
    association_rule_sets = []
    for score in self.scores:
        aset = deepcopy(association_rule_set)
        aset.set_target_accuracy(score)
        association_rule_sets.append(aset)
    used_features = {}
    features = []
    features_to_select = (self.features_per_class * len(self.scores)
                          * len(class_labels))
    feature_class_counts = {}
    for score in self.scores:
        feature_class_counts[score] = {}
        for class_label in class_labels:
            feature_class_counts[score][class_label] = self.features_per_class
    print "Processing features (%d)" % features_to_select
    while features_to_select > 0:
        for score_index in xrange(len(self.scores)):
            score = self.scores[score_index]
            association_rule_set = association_rule_sets[score_index]
            # Pick the next unused rule for each class under this score.
            for class_label in class_labels:
                for rule_index in xrange(len(association_rule_set)):
                    rule = association_rule_set[rule_index]
                    if rule.rs[0] == class_label:
                        if rule.ls[0] not in used_features:
                            used_features[rule.ls[0]] = 1
                            feature_class_counts[score][rule.rs[0]] -= 1
                            features.append(rule.ls[0])
                            features_to_select -= 1
                            break
    print "Finished processing for %s, found %d features" % (
        str(self.scores), len(features))
    # Note: this previously referenced the undefined name `scores`;
    # the intended value is self.scores.
    expected = self.features_per_class * len(self.scores) * len(class_labels)
    if len(features) != expected:
        print "ERROR! did not find enough features...%d instead of %d" % (
            len(features), expected)
    return features
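# A minimal usage sketch for select(). The class name CWMISelector and the
# attribute values are assumptions for illustration; the attribute names
# (confounder, confounders, scores, features_per_class) are the ones this
# method actually reads from self:
#
#   selector = CWMISelector()                    # hypothetical host class
#   selector.confounders = confounder_table      # mapping consumed by CWMILibrary
#   selector.confounder = "order"                # single confounder to condition on
#   selector.scores = ["mi", "cmi", "cwmi"]      # ranking criteria to round-robin over
#   selector.features_per_class = 10             # quota per class per score
#   features = selector.select(sample_set)       # list of selected feature indices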