Exemple #1
0
	def train(self,sample_set):
		"""Build one single-feature association rule per feature index.

		Every feature index of ``sample_set`` is scored through
		``CWMILibrary._calculate_information_scores`` against the hard-coded
		"order" confounder.  The ``mi`` and ``cmi`` scores, plus a combined
		value ``cmi * mi / hygivenz`` (0.0 when ``hygivenz`` is not positive),
		are stored under ``order_mi``, ``order_cmi`` and ``order_cwmi``.

		Each feature index becomes an ``AssociationRule`` whose right-hand
		side is the label returned by ``self.get_best_class``.  The collected
		set is passed through ``remap_index_to_feature(sample_set)`` and that
		result is returned.
		"""
		library = CWMILibrary()
		library.confounders = self.confounders
		scores_by_feature = {}
		feature_count = sample_set.get_number_of_features()
		# Only "order" is iterated; self.confounders is handed to the library
		# above but does not drive this loop.
		active_confounders = ["order"]
		library.P_Y = None
		for confounder in active_confounders:
			# P_Y_Z and P_Z are cleared each time a new confounder is started.
			library.P_Y_Z = None
			library.P_Z = None
			for feature in xrange(feature_count):
				if feature % 100 == 0:
					print("Processed %d of %d"%(feature,feature_count))
				scores = library._calculate_information_scores(sample_set,feature,confounder)
				entry = {
					"%s_mi"%(confounder): scores["mi"],
					"%s_cmi"%(confounder): scores["cmi"],
				}
				if scores["hygivenz"] > 0:
					weighted = float(scores["cmi"]*scores["mi"])/scores["hygivenz"]
				else:
					# Avoid dividing by a zero (or negative) H(Y|Z) value.
					weighted = 0.0
				entry["%s_cwmi"%(confounder)] = weighted
				scores_by_feature.setdefault(feature,{}).update(entry)

		rule_set = AssociationRuleSet()
		rules = [
			AssociationRule([feature],[self.get_best_class([feature],sample_set)],attributes)
			for feature,attributes in scores_by_feature.items()
		]
		rule_set.extend(rules)
		return rule_set.remap_index_to_feature(sample_set)
Exemple #2
0
    def train(self, sample_set):
        """Build one single-feature association rule per feature index.

        Every feature index of ``sample_set`` is scored through
        ``CWMILibrary._calculate_information_scores`` against the hard-coded
        "order" confounder.  The ``mi`` and ``cmi`` scores, plus a combined
        value ``cmi * mi / hygivenz`` (0.0 when ``hygivenz`` is not positive),
        are stored under ``order_mi``, ``order_cmi`` and ``order_cwmi``.

        Each feature index becomes an ``AssociationRule`` whose right-hand
        side is the label returned by ``self.get_best_class``.  The collected
        set is passed through ``remap_index_to_feature(sample_set)`` and that
        result is returned.
        """
        library = CWMILibrary()
        library.confounders = self.confounders
        scores_by_feature = {}
        feature_count = sample_set.get_number_of_features()
        # Only "order" is iterated; self.confounders is handed to the library
        # above but does not drive this loop.
        active_confounders = ["order"]
        library.P_Y = None
        for confounder in active_confounders:
            # P_Y_Z and P_Z are cleared each time a new confounder is started.
            library.P_Y_Z = None
            library.P_Z = None
            for feature in xrange(feature_count):
                if feature % 100 == 0:
                    print("Processed %d of %d" % (feature, feature_count))
                scores = library._calculate_information_scores(
                    sample_set, feature, confounder)
                entry = {
                    "%s_mi" % (confounder): scores["mi"],
                    "%s_cmi" % (confounder): scores["cmi"],
                }
                if scores["hygivenz"] > 0:
                    weighted = float(
                        scores["cmi"] * scores["mi"]) / scores["hygivenz"]
                else:
                    # Avoid dividing by a zero (or negative) H(Y|Z) value.
                    weighted = 0.0
                entry["%s_cwmi" % (confounder)] = weighted
                scores_by_feature.setdefault(feature, {}).update(entry)

        rule_set = AssociationRuleSet()
        rules = [
            AssociationRule(
                [feature],
                [self.get_best_class([feature], sample_set)],
                attributes)
            for feature, attributes in scores_by_feature.items()
        ]
        rule_set.extend(rules)
        return rule_set.remap_index_to_feature(sample_set)
		sys.exit(1)
	
	if len(target_samples) ==0:
		print "Could not find samples!"
		sys.exit()
	samples_time = pt.stop()
	print "Loaded samples (%0.2fs)"%(samples_time)
	
	pt.start()
	
	rules = load_rules(options.model_filename)
	rules = rules.remap_feature_to_index(samples)
	training_time = pt.stop()
	newrules = []
	
	for rule in rules:
		keep_rule = False
		for target_sample in target_samples:
			if target_sample.satisfies(rule.ls):
				keep_rule = True
		if keep_rule:
			newrules.append(rule)
	newruleset = AssociationRuleSet()
	newruleset.extend(newrules)
	newruleset = newruleset.remap_index_to_feature(samples)
	newruleset.write(filename=options.output_filename)
	
	
	
	
	
Exemple #4
0
	def select(self,sample_set):
		"""Select features by cycling over the configured scores and class labels.

		Steps:
		1. Every feature index of ``sample_set`` is scored against
		   ``self.confounder`` via ``CWMILibrary._calculate_information_scores``;
		   ``mi``, ``cmi`` and ``cwmi`` (``cmi * mi / hygivenz``, or 0.0 when
		   ``hygivenz`` is not positive) are recorded per feature.
		2. One ``AssociationRule`` is built per feature, labelled with the first
		   entry of ``self.get_best_classes`` (or "NULL" when it is empty), and
		   the set is passed through ``remap_index_to_feature(sample_set)``.
		3. A deep copy of that set is made for each entry of ``self.scores`` and
		   ``set_target_accuracy(score)`` is called on it
		   (NOTE(review): presumably this orders the copy by that score - confirm).
		4. In repeated passes, for every score copy and every class label, the
		   first rule whose right side starts with that label and whose first
		   left-side item has not been taken yet is selected.

		The target count is
		``self.features_per_class * len(self.scores) * len(class_labels)``.
		Passes stop once the target is reached or when a whole pass selects
		nothing (rules exhausted), so the method always terminates.

		Returns the list of selected features (the ``ls[0]`` of each chosen
		rule) in selection order.  A message is printed when the number found
		differs from the target.
		"""
		cwmilibrary = CWMILibrary()
		cwmilibrary.confounders = self.confounders
		confounder = self.confounder
		cwmilibrary.P_Y = None
		cwmilibrary.P_Y_Z = None
		cwmilibrary.P_Z = None

		# Per-feature score dictionaries, keyed by feature index.
		ardic = {}
		for feature in xrange(sample_set.get_number_of_features()):
			# Named "info" rather than "scores" so it cannot be confused with
			# self.scores further down.
			info = cwmilibrary._calculate_information_scores(sample_set,feature,confounder)
			cwmi = 0.0
			if info["hygivenz"] > 0:
				cwmi = float(info["cmi"]*info["mi"])/info["hygivenz"]
			ardic[feature] = {"mi":info["mi"],"cmi":info["cmi"],"cwmi":cwmi}

		association_rule_set = AssociationRuleSet()
		arlist = []
		for key in ardic:
			best_classes = self.get_best_classes([key],sample_set)
			class_label = "NULL"
			if len(best_classes) > 0:
				class_label = best_classes[0]
			arlist.append(AssociationRule([key],[class_label],ardic[key]))
		association_rule_set.extend(arlist)
		class_labels = self._get_class_labels(sample_set)

		association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
		# Make one copy per score, then switch back and forth between the
		# copies until the target is filled.
		association_rule_sets = []
		for score in self.scores:
			aset = deepcopy(association_rule_set)
			aset.set_target_accuracy(score)
			association_rule_sets.append(aset)

		expected_count = self.features_per_class*len(self.scores)*len(class_labels)
		features_to_select = expected_count
		used_features = set()
		features = []
		print("Processing features (%d)"%(features_to_select))
		while features_to_select > 0:
			selected_this_pass = 0
			for scored_rule_set in association_rule_sets:
				# Pick the next unused rule for each class.
				for class_label in class_labels:
					for rule_index in xrange(len(scored_rule_set)):
						rule = scored_rule_set[rule_index]
						if rule.rs[0] != class_label:
							continue
						if rule.ls[0] in used_features:
							continue
						used_features.add(rule.ls[0])
						features.append(rule.ls[0])
						features_to_select -= 1
						selected_this_pass += 1
						break
			if selected_this_pass == 0:
				# Nothing left to pick for any score/class combination;
				# without this guard the loop would spin forever.
				break
		print("Finished processing for %s, found %d features"%(str(self.scores),len(features)))
		if len(features) != expected_count:
			print("ERROR! did not find enough features...%d insead of %d"%(len(features),expected_count))

		return features
Exemple #5
0
        sample_id_list = []
        class_label_counts = {}
        for sid in sets[sampleid]:
            class_label = samples.get_by_id(sid).get_class_label()
            if not class_label_counts.has_key(class_label):
                class_label_counts[class_label] = 0
            class_label_counts[class_label] += 1
            sample_id_list.append("%s(%s)" % (sid, class_label))
        for class_label in sorted(class_label_counts.keys()):
            f.write("%s:%d  " % (class_label, class_label_counts[class_label]))
        f.write("%s\n" % (",".join(sample_id_list)))

        sample_rules = []
        arset = AssociationRuleSet()
        arset.extend(newsamples[sampleid])
        arset = arset.remap_index_to_feature(samples)
        for rule in arset:
            sample_rules.append("((%s)>(%s)(%s))" % (",".join(
                rule.ls), ",".join(rule.rs), rule.attributes["laplace"]))
        f.write("\n".join(sample_rules))
        f.write("\n\n")

    "Do aggregate analysis of broken down rules"
    finished_items = {}
    for rule in indexed_rules:
        for item in rule.ls:
            finished_items[item] = 1
    items = sorted(finished_items.keys())
    f.write("\n\nItems in rules\n\n")
    all_class_labels = {}
    for sample in samples:
		sample_id_list = []
		class_label_counts = {}
		for sid in sets[sampleid]:
			class_label = samples.get_by_id(sid).get_class_label()
			if not class_label_counts.has_key(class_label):
				class_label_counts[class_label] = 0
			class_label_counts[class_label] += 1
			sample_id_list.append("%s(%s)"%(sid,class_label))
		for class_label in sorted(class_label_counts.keys()):
			f.write("%s:%d  "%(class_label,class_label_counts[class_label]))
		f.write("%s\n"%(",".join(sample_id_list)))
		
		sample_rules = []
		arset = AssociationRuleSet()
		arset.extend(newsamples[sampleid])
		arset = arset.remap_index_to_feature(samples)
		for rule in arset:
			sample_rules.append("((%s)>(%s)(%s))"%(",".join(rule.ls),",".join(rule.rs),rule.attributes["laplace"]))
		f.write("\n".join(sample_rules))
		f.write("\n\n")
	
	"Do aggregate analysis of broken down rules"
	finished_items = {}
	for rule in indexed_rules:
		for item in rule.ls:
			finished_items[item] = 1
	items = sorted(finished_items.keys())
	f.write("\n\nItems in rules\n\n")
	all_class_labels = {}
	for sample in samples:
		all_class_labels[sample.get_class_label()] = 1
Exemple #7
0
                if (sample.id == target_sample_id):
                    target_samples.append(sample)
    else:
        print "You must specify a target sample"
        sys.exit(1)

    if len(target_samples) == 0:
        print "Could not find samples!"
        sys.exit()
    samples_time = pt.stop()
    print "Loaded samples (%0.2fs)" % (samples_time)

    pt.start()

    rules = load_rules(options.model_filename)
    rules = rules.remap_feature_to_index(samples)
    training_time = pt.stop()
    newrules = []

    for rule in rules:
        keep_rule = False
        for target_sample in target_samples:
            if target_sample.satisfies(rule.ls):
                keep_rule = True
        if keep_rule:
            newrules.append(rule)
    newruleset = AssociationRuleSet()
    newruleset.extend(newrules)
    newruleset = newruleset.remap_index_to_feature(samples)
    newruleset.write(filename=options.output_filename)