Example #1
0
	def train(self,sample_set):
		"""Build one single-feature association rule per feature index.

		For every feature index of ``sample_set`` the dictionary returned by
		``CWMILibrary._calculate_information_scores`` is read and three values
		are stored under ``"<confounder>_mi"``, ``"<confounder>_cmi"`` and
		``"<confounder>_cwmi"``, where ``cwmi = cmi * mi / hygivenz`` (0.0 when
		``hygivenz`` is not positive).  Each feature is then paired with the
		label returned by ``self.get_best_class`` in an ``AssociationRule``.

		:param sample_set: object providing ``get_number_of_features()``; it is
			also passed to the scoring, labelling and remapping helpers.
		:returns: whatever ``AssociationRuleSet.remap_index_to_feature`` returns
			for the collected rules.
		"""
		cwmilibrary = CWMILibrary()
		cwmilibrary.confounders = self.confounders
		nattributes = sample_set.get_number_of_features()
		# Only "order" is evaluated.  The previous code first assigned
		# genus/family/order/class/phylum/superkingdom to this list and then
		# overwrote it on the very next line, so that assignment was dead.
		confounders = ["order"]
		ardic = {}
		# NOTE(review): P_Y / P_Y_Z / P_Z look like cached tables on the library
		# object; P_Y is cleared once, the other two per confounder -- confirm.
		cwmilibrary.P_Y = None
		for confounder in confounders:
			cwmilibrary.P_Y_Z = None
			cwmilibrary.P_Z = None
			# The score keys depend only on the confounder, so build them once.
			mi_key = "%s_mi"%(confounder)
			cmi_key = "%s_cmi"%(confounder)
			cwmi_key = "%s_cwmi"%(confounder)
			for feature in range(nattributes):
				if feature % 100 == 0:
					print("Processed %d of %d"%(feature,nattributes))
				scores = cwmilibrary._calculate_information_scores(sample_set,feature,confounder)
				cwmi = 0.0
				if scores["hygivenz"] > 0:
					cwmi = float(scores["cmi"]*scores["mi"])/scores["hygivenz"]
				# setdefault replaces the deprecated has_key() check; scores of
				# several confounders accumulate in the same per-feature dict.
				ardic.setdefault(feature,{}).update({
					mi_key: scores["mi"],
					cmi_key: scores["cmi"],
					cwmi_key: cwmi,
				})
		association_rule_set = AssociationRuleSet()
		arlist = []
		for key in ardic:
			class_label = self.get_best_class([key],sample_set)
			arlist.append(AssociationRule([key],[class_label],ardic[key]))
		association_rule_set.extend(arlist)
		return association_rule_set.remap_index_to_feature(sample_set)
Example #2
0
    def train(self, sample_set):
        """Score every feature and wrap each one in an association rule.

        The per-feature values come from
        ``CWMILibrary._calculate_information_scores``; its ``"mi"`` and
        ``"cmi"`` entries are copied and a combined ``cwmi`` value is derived
        as ``cmi * mi / hygivenz`` (left at 0.0 when ``hygivenz`` is not
        positive).  All three are stored under keys prefixed with the
        confounder name, e.g. ``"order_cwmi"``.

        :param sample_set: object providing ``get_number_of_features()``; it
            is also passed to ``self.get_best_class`` and to
            ``remap_index_to_feature``.
        :returns: the value returned by
            ``AssociationRuleSet.remap_index_to_feature(sample_set)``.
        """
        cwmilibrary = CWMILibrary()
        cwmilibrary.confounders = self.confounders
        nattributes = sample_set.get_number_of_features()
        # Only "order" is used.  A longer list (genus, family, order, class,
        # phylum, superkingdom) used to be assigned here and immediately
        # overwritten; that dead assignment has been dropped.
        confounders = ["order"]
        ardic = {}
        # NOTE(review): these attributes are reset before scoring, presumably
        # to invalidate cached probability tables -- confirm in CWMILibrary.
        cwmilibrary.P_Y = None
        for confounder in confounders:
            cwmilibrary.P_Y_Z = None
            cwmilibrary.P_Z = None
            # Key names are invariant across features for a given confounder.
            mi_key = "%s_mi" % (confounder)
            cmi_key = "%s_cmi" % (confounder)
            cwmi_key = "%s_cwmi" % (confounder)
            for feature in range(nattributes):
                if feature % 100 == 0:
                    print("Processed %d of %d" % (feature, nattributes))
                scores = cwmilibrary._calculate_information_scores(
                    sample_set, feature, confounder)
                cwmi = 0.0
                if scores["hygivenz"] > 0:
                    cwmi = float(
                        scores["cmi"] * scores["mi"]) / scores["hygivenz"]
                # ``setdefault`` stands in for the deprecated ``has_key`` test.
                per_feature = ardic.setdefault(feature, {})
                per_feature[mi_key] = scores["mi"]
                per_feature[cmi_key] = scores["cmi"]
                per_feature[cwmi_key] = cwmi
        association_rule_set = AssociationRuleSet()
        arlist = []
        for key in ardic:
            class_label = self.get_best_class([key], sample_set)
            arlist.append(AssociationRule([key], [class_label], ardic[key]))
        association_rule_set.extend(arlist)
        return association_rule_set.remap_index_to_feature(sample_set)
Example #3
0
	def select(self,sample_set):
		"""Select features round-robin over ``self.scores`` and class labels.

		Steps:

		1. Every feature index is scored with
		   ``CWMILibrary._calculate_information_scores`` using
		   ``self.confounder``; ``"mi"``, ``"cmi"`` and the derived ``"cwmi"``
		   (``cmi * mi / hygivenz``, or 0.0 when ``hygivenz`` is not positive)
		   are kept per feature.
		2. Each feature becomes an ``AssociationRule`` whose right-hand side is
		   the first entry of ``self.get_best_classes`` (``"NULL"`` when that
		   is empty).
		3. One deep copy of the rule set is made per entry of ``self.scores``
		   and ``set_target_accuracy(score)`` is called on it.
		4. In repeated passes, for every score and every class label, the first
		   rule (in rule-set order) for that label whose feature has not been
		   used yet is taken.

		The target count is
		``self.features_per_class * len(self.scores) * len(class_labels)``.
		Selection stops when that many picks have been made or when a complete
		pass adds nothing.  No per-class cap is enforced: a label that runs out
		of rules is simply skipped while the remaining labels keep contributing.

		:param sample_set: object providing ``get_number_of_features()``; also
			passed to the scoring, labelling and remapping helpers.
		:returns: list of selected features (``rule.ls[0]`` values) in the
			order they were picked.
		"""
		cwmilibrary = CWMILibrary()
		cwmilibrary.confounders = self.confounders
		ardic = {}
		nattributes = sample_set.get_number_of_features()
		confounder = self.confounder
		# NOTE(review): presumably clears cached probability tables -- confirm.
		cwmilibrary.P_Y = None
		cwmilibrary.P_Y_Z = None
		cwmilibrary.P_Z = None
		for feature in range(nattributes):
			# Named ``info`` rather than ``scores`` so it cannot be mistaken for
			# ``self.scores`` further down (the cause of the old length bug).
			info = cwmilibrary._calculate_information_scores(sample_set,feature,confounder)
			cwmi = 0.0
			if info["hygivenz"] > 0:
				cwmi = float(info["cmi"]*info["mi"])/info["hygivenz"]
			# Each index is visited exactly once, so a plain assignment is enough.
			ardic[feature] = {"mi": info["mi"], "cmi": info["cmi"], "cwmi": cwmi}

		association_rule_set = AssociationRuleSet()
		arlist = []
		for key in ardic:
			best_classes = self.get_best_classes([key],sample_set)
			class_label = "NULL"
			if len(best_classes) > 0:
				class_label = best_classes[0]
			arlist.append(AssociationRule([key],[class_label],ardic[key]))
		association_rule_set.extend(arlist)
		class_labels = self._get_class_labels(sample_set)

		association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
		# Make one copy per score, then alternate between the copies until
		# enough features have been collected.
		# NOTE(review): set_target_accuracy presumably makes the copy rank its
		# rules by that score -- confirm in AssociationRuleSet.
		association_rule_sets = []
		for score in self.scores:
			aset = deepcopy(association_rule_set)
			aset.set_target_accuracy(score)
			association_rule_sets.append(aset)

		features_wanted = self.features_per_class*len(self.scores)*len(class_labels)
		features_to_select = features_wanted
		used_features = set()
		features = []
		print("Processing features (%d)"%(features_to_select))
		while features_to_select > 0:
			found_before_pass = len(features)
			for score_index in range(len(self.scores)):
				rule_set = association_rule_sets[score_index]
				# Pick the next unused rule for each class.
				for class_label in class_labels:
					for rule_index in range(len(rule_set)):
						rule = rule_set[rule_index]
						if rule.rs[0] == class_label and rule.ls[0] not in used_features:
							used_features.add(rule.ls[0])
							features.append(rule.ls[0])
							features_to_select -= 1
							break
			# Without this guard the loop would spin forever once every rule
			# set is exhausted before the target count is reached.
			if len(features) == found_before_pass:
				break
		print("Finished processing for %s, found %d features"%(str(self.scores),len(features)))
		# Compare against len(self.scores); the old code used the length of the
		# per-feature score dict left over from the scoring loop.
		if len(features) != features_wanted:
			print("ERROR! did not find enough features...%d insead of %d"%(len(features),features_wanted))

		return features