Example #1
	def train(self,sample_set):
		cwmilibrary = CWMILibrary()
		cwmilibrary.confounders = self.confounders
		ardic = {}
		nattributes = sample_set.get_number_of_features()
		confounders = ["genus","family","order","class","phylum","superkingdom"]
		confounders = ["order"]
		cwmilibrary.P_Y = None
		for confounder in confounders:
			cwmilibrary.P_Y_Z = None
			cwmilibrary.P_Z = None
			for i in xrange(nattributes):
				if i % 100 == 0:
					print "Processed %d of %d"%(i,nattributes)
				feature = i
				feature_scores = {}
				scores = cwmilibrary._calculate_information_scores(sample_set,feature,confounder)
				feature_scores["%s_mi"%(confounder)] = scores["mi"]
				feature_scores["%s_cmi"%(confounder)] = scores["cmi"]
				cwmi = 0.0
				if scores["hygivenz"] > 0:
					# combine conditional MI and MI, normalized by H(Y|Z) ("hygivenz")
					cwmi = float(scores["cmi"]*scores["mi"])/scores["hygivenz"]
				feature_scores["%s_cwmi"%(confounder)] = cwmi
				
				if feature not in ardic:
					ardic[feature] = {}
				ardic[feature].update(feature_scores)
		association_rule_set = AssociationRuleSet()
		arlist = []
		for key in ardic.keys():
			class_label = self.get_best_class([key],sample_set)
			arlist.append(AssociationRule([key],[class_label],ardic[key]))
		association_rule_set.extend(arlist)
		association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
		return association_rule_set
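For context, a minimal driver sketch for the trainer above. The class name CWMITrainer is an assumption for the class that defines train(); FileIO usage follows Examples #8 and #9, and the file paths are illustrative.
	fileio = FileIO()
	samples = fileio.load_samples("samples.tsv")	# illustrative path
	classes = fileio.load_classes("classes.tsv")	# illustrative path
	samples.load_class_labels(classes)

	trainer = CWMITrainer()	# assumed class name
	trainer.confounders = ["order"]
	rules = trainer.train(samples)	# AssociationRuleSet with *_mi, *_cmi, *_cwmi scores
	for rule in rules:
		print rule.ls, rule.rs, rule.attributes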
Example #2
	def train(self,samples,maxRuleSize=9999,mineOnlyClass=None):
		"""Train with CPAR on the sample set, returning an AssociationRuleSet."""
		self.MAX_RULE_SIZE = maxRuleSize
		self.objPNData = PNData(samples)
		self.lstRules = []
		classes = self.objPNData.getClassList()
		
		log("Dataset has %d classes over %d samples."%(len(classes),len(samples)))
		for current_class in classes:
			if mineOnlyClass is not None and current_class != mineOnlyClass:
				continue
			log("Processing class %s"%(current_class))
			self.objPNData.setCurrentClass(current_class)
			dblMinTotalWeight = self.dblTotalWeightFactor * self.objPNData.getTotalWeight()
			lstAntecedent = []
			while self.objPNData.getTotalWeight() > dblMinTotalWeight:
				self.objPNData.refreshPNAData()
				if self.objPNData.noValidGainsinPNarray(self.dblMinGainThreshold):
					#log("NO VALID GAINS....Breaking!"); 
					break
				#log('BEGIN DEPTH FIRST SEARCH - total weight %f > %f'%(self.objPNData.getTotalWeight(),dblMinTotalWeight))
				self._CPARdfs(self.objPNData.copyPrimes(),lstAntecedent,[current_class])
		trules = len(self.lstRules)
		self.removeDuplicateRules()
		#log("End of rule search. Found %d rules total, %d after duplicates removed."%(trules,len(self.lstRules)))
		arset = AssociationRuleSet()
		arset.extend(self.lstRules)
		arset.set_target_accuracy("laplace")
		return self.remap_index_to_feature(arset,samples)
Example #3
	def load_rules(self,filename,target_label="YES",merge_with_rules=None):
		"""Parse the NETCAR output into a new AssociationRuleSet."""
		with open(filename) as infile:	# close the file promptly
			data = infile.readlines()
		arset = None
		if merge_with_rules is None:
			arset = AssociationRuleSet()
			arset.rules = []
		else:
			arset = merge_with_rules
		header = [x.strip() for x in data[1][2:].split("\t")]	# the first column holds the rule's left side (comma-separated); the remaining columns are scores
		arset.attributes = header[1:]
		for line in data[2:]:	# the first two lines are headers
			fields = [x.strip() for x in line.split()]
			items = tuple([self.index_to_feature[int(x.strip())] for x in fields[0].split(",")])
			label = target_label
			scores = [x.strip() for x in fields[1:]]
			attributes = {}
			for i in range(len(arset.attributes)):
				attributes[arset.attributes[i]] = scores[i]
			arset.rules.append(AssociationRule(items,[label],attributes))
		return arset
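A hedged usage sketch for the parser above; parser stands in for an instance of the class defining load_rules(), and the filenames are illustrative. Passing merge_with_rules lets a second NETCAR output file extend an existing rule set.
	arset = parser.load_rules("netcar_run1.out", target_label="YES")
	arset = parser.load_rules("netcar_run2.out", merge_with_rules=arset)
	print "%d rules with attributes: %s"%(len(arset.rules), ", ".join(arset.attributes))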
Example #4
    def train(self, sample_set):
        cwmilibrary = CWMILibrary()
        cwmilibrary.confounders = self.confounders
        ardic = {}
        nattributes = sample_set.get_number_of_features()
        confounders = [
            "genus", "family", "order", "class", "phylum", "superkingdom"
        ]
        confounders = ["order"]  # NOTE: the full taxonomic list above is immediately overridden; only "order" is used here
        cwmilibrary.P_Y = None
        for confounder in confounders:
            cwmilibrary.P_Y_Z = None
            cwmilibrary.P_Z = None
            for i in xrange(nattributes):
                if i % 100 == 0:
                    print "Processed %d of %d" % (i, nattributes)
                feature = i
                feature_scores = {}
                scores = cwmilibrary._calculate_information_scores(
                    sample_set, feature, confounder)
                feature_scores["%s_mi" % (confounder)] = scores["mi"]
                feature_scores["%s_cmi" % (confounder)] = scores["cmi"]
                cwmi = 0.0
                if scores["hygivenz"] > 0:
                    cwmi = float(
                        scores["cmi"] * scores["mi"]) / scores["hygivenz"]
                feature_scores["%s_cwmi" % (confounder)] = cwmi

                if feature not in ardic:
                    ardic[feature] = {}
                ardic[feature].update(feature_scores)
        association_rule_set = AssociationRuleSet()
        arlist = []
        for key in ardic.keys():
            class_label = self.get_best_class([key], sample_set)
            arlist.append(AssociationRule([key], [class_label], ardic[key]))
        association_rule_set.extend(arlist)
        association_rule_set = association_rule_set.remap_index_to_feature(
            sample_set)
        return association_rule_set
Example #5
    def train(self, samples, maxRuleSize=9999, mineOnlyClass=None):
        """Train with CPAR on the sample set, returning an AssociationRuleSet."""
        self.MAX_RULE_SIZE = maxRuleSize
        self.objPNData = PNData(samples)
        self.lstRules = []
        classes = self.objPNData.getClassList()

        log("Dataset has %d classes over %d samples." %
            (len(classes), len(samples)))
        for current_class in classes:
            if mineOnlyClass is not None and current_class != mineOnlyClass:
                continue
            log("Processing class %s" % (current_class))
            self.objPNData.setCurrentClass(current_class)
            dblMinTotalWeight = self.dblTotalWeightFactor * self.objPNData.getTotalWeight()
            lstAntecedent = []
            while self.objPNData.getTotalWeight() > dblMinTotalWeight:
                self.objPNData.refreshPNAData()
                if self.objPNData.noValidGainsinPNarray(
                        self.dblMinGainThreshold):
                    #log("NO VALID GAINS....Breaking!");
                    break
                #log('BEGIN DEPTH FIRST SEARCH - total weight %f > %f'%(self.objPNData.getTotalWeight(),dblMinTotalWeight))
                self._CPARdfs(self.objPNData.copyPrimes(), lstAntecedent,
                              [current_class])
        trules = len(self.lstRules)
        self.removeDuplicateRules()
        #log("End of rule search. Found %d rules total, %d after duplicates removed."%(trules,len(self.lstRules)))
        arset = AssociationRuleSet()
        arset.extend(self.lstRules)
        arset.set_target_accuracy("laplace")
        return self.remap_index_to_feature(arset, samples)
Example #6
		sys.exit(1)
	
	if len(target_samples) == 0:
		print "Could not find samples!"
		sys.exit()
	samples_time = pt.stop()
	print "Loaded samples (%0.2fs)"%(samples_time)
	
	pt.start()
	
	rules = load_rules(options.model_filename)
	rules = rules.remap_feature_to_index(samples)
	training_time = pt.stop()
	newrules = []
	
	for rule in rules:
		keep_rule = False
		for target_sample in target_samples:
			if target_sample.satisfies(rule.ls):
				keep_rule = True
				break	# one satisfying sample is enough
		if keep_rule:
			newrules.append(rule)
	newruleset = AssociationRuleSet()
	newruleset.extend(newrules)
	newruleset = newruleset.remap_index_to_feature(samples)
	newruleset.write(filename=options.output_filename)
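The loop above keeps every rule whose left side is satisfied by at least one target sample; here is the same logic as a standalone helper, a sketch assuming only the satisfies() method shown in the example.
	def filter_rules_by_samples(rules, target_samples):
		"""Keep rules whose antecedent matches at least one target sample."""
		kept = []
		for rule in rules:
			for sample in target_samples:
				if sample.satisfies(rule.ls):
					kept.append(rule)
					break	# one match is enough
		return kept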
Example #7
	def select(self,sample_set):
		cwmilibrary = CWMILibrary()
		cwmilibrary.confounders = self.confounders
		ardic = {}
		nattributes = sample_set.get_number_of_features()
		confounder = self.confounder
		cwmilibrary.P_Y = None
		cwmilibrary.P_Y_Z = None
		cwmilibrary.P_Z = None
		for i in xrange(nattributes):
			#if i % 100 == 0:
			#	print "Processed %d of %d"%(i,nattributes)
			feature = i
			feature_scores = {}
			scores = cwmilibrary._calculate_information_scores(sample_set,feature,confounder)
			feature_scores["mi"] = scores["mi"]
			feature_scores["cmi"] = scores["cmi"]
			cwmi = 0.0
			if scores["hygivenz"] > 0:
				cwmi = float(scores["cmi"]*scores["mi"])/scores["hygivenz"]
			feature_scores["cwmi"] = cwmi
			
			#association_rule = AssociationRule([feature],["NULL"],feature_scores)
			if feature not in ardic:
				ardic[feature] = {}
			ardic[feature].update(feature_scores)
		#arlist[feature] = feature_scores
		association_rule_set = AssociationRuleSet()
		arlist = []
		for key in ardic.keys():
			class_labels = self.get_best_classes([key],sample_set)
			class_label = "NULL"
			if len(class_labels) > 0:
				class_label = class_labels[0]
			arlist.append(AssociationRule([key],[class_label],ardic[key]))
		association_rule_set.extend(arlist)
		class_labels = self._get_class_labels(sample_set)
		
		association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
		"Make several copies for each score, then switch back and forth between each until filled..."
		association_rule_sets = []
		for score in self.scores:
			aset = deepcopy(association_rule_set)
			aset.set_target_accuracy(score)
			association_rule_sets.append(aset)
		
		used_features = {}
		features = []
		features_to_select = self.features_per_class*len(self.scores)*len(class_labels)
		feature_class_counts = {}
		for score in self.scores:
			feature_class_counts[score] = {}
			for class_label in class_labels:
				feature_class_counts[score][class_label] = self.features_per_class
		print "Processing features (%d)"%(features_to_select)
		while features_to_select > 0:
			for score_index in xrange(len(self.scores)):
				score = self.scores[score_index]
				association_rule_set = association_rule_sets[score_index]
				"Pick the next rule for each class..."
				for class_label in class_labels:
					for rule_index in xrange(len(association_rule_set)):
						rule = association_rule_set[rule_index]
						if rule.rs[0] == class_label:
						if rule.ls[0] not in used_features:
								used_features[rule.ls[0]] = 1
								feature_class_counts[score][rule.rs[0]] -= 1
								features.append(rule.ls[0])
								features_to_select-=1
								break
		print "Finished processing for %s, found %d features"%(str(self.scores),len(features))
		if len(features) != self.features_per_class*len(self.scores)*len(class_labels):
			print "ERROR! did not find enough features...%d instead of %d"%(len(features),self.features_per_class*len(self.scores)*len(class_labels))
			
		return features
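A hedged sketch of configuring and running the selector above; CWMIFeatureSelector is an assumed class name, and all attribute values are illustrative. select() returns features_per_class * len(scores) * len(class_labels) feature names.
	selector = CWMIFeatureSelector()	# assumed class name
	selector.confounders = ["order"]	# passed through to CWMILibrary
	selector.confounder = "order"
	selector.scores = ["mi","cmi","cwmi"]
	selector.features_per_class = 10
	features = selector.select(samples)
	print "Selected %d features"%(len(features))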
Example #8
		error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
		errorCount += 1
	if errorCount > 0:
		error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
		exit(1)
	
	pt.start()
	fileio = FileIO()
	samples = fileio.load_samples(options.input_samples_filename)
	samples_time = pt.stop()
	print "Loaded samples (%0.2fs)"%(samples_time)
	if options.feature_select:
		print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
		pt.start()
		from pica.AssociationRule import load_rules,AssociationRuleSet
		selected_rules = AssociationRuleSet()
		rules = load_rules(options.feature_select)
		rules.set_target_accuracy(options.feature_select_score)
		selected_rules.extend(rules[:options.feature_select_top_n])
		samples = samples.feature_select(selected_rules)
		print "Finished feature selection (%0.2fs)"%(pt.stop())
	classes = fileio.load_classes(options.input_classes_filename)
	samples.load_class_labels(classes)
	print "Sample set has %d features."%(samples.get_number_of_features())
	samples.set_current_class(options.target_class)
	
	pt.start()
	print "Compressing features...",
	samples = samples.compress_features()
	compression_time = pt.stop()
	print "\bfinished compression.(%0.2fs)"%(compression_time)
Example #9
	if not options.target_class:
		error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
		errorCount += 1
	if not options.output_filename:
		error("Please specify a file for the output with -o /path/to/result.file")
		errorCount += 1
	if errorCount > 0:
		error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
		exit(1)
		
	fileio = FileIO()
	samples = fileio.load_samples(options.input_samples_filename)
	if options.feature_select:
		print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
		from pica.AssociationRule import load_rules,AssociationRuleSet
		selected_rules = AssociationRuleSet()
		rules = load_rules(options.feature_select)
		rules.set_target_accuracy(options.feature_select_score)
		selected_rules.extend(rules[:options.feature_select_top_n])
		samples = samples.feature_select(selected_rules)
	classes = fileio.load_classes(options.input_classes_filename)
	samples.load_class_labels(classes)
	print "Sample set has %d features."%(samples.get_number_of_features())
	samples.set_current_class(options.target_class)
	print "Parameters from %s"%(options.parameters)
	print "Compressing features...",
	samples = samples.compress_features()
	print "compressed to %d distinct features."%(samples.get_number_of_features())
	
	samples.set_current_class(options.target_class)
	samples.hide_nulls(options.target_class)
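For completeness, a hedged sketch of the target-class setup that closes the example; "Motile" is an illustrative trait name, and hide_nulls is assumed to drop samples that carry no label for the target trait.
	samples.set_current_class("Motile")	# illustrative trait name
	samples.hide_nulls("Motile")
	print "%d samples remain after hiding nulls."%(len(samples))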
Example #10
        if len(sets[sampleid]) == 1 and sets[sampleid][0] != sampleid:
            continue
        sample_id_list = []
        class_label_counts = {}
        for sid in sets[sampleid]:
            class_label = samples.get_by_id(sid).get_class_label()
            if class_label not in class_label_counts:
                class_label_counts[class_label] = 0
            class_label_counts[class_label] += 1
            sample_id_list.append("%s(%s)" % (sid, class_label))
        for class_label in sorted(class_label_counts.keys()):
            f.write("%s:%d  " % (class_label, class_label_counts[class_label]))
        f.write("%s\n" % (",".join(sample_id_list)))

        sample_rules = []
        arset = AssociationRuleSet()
        arset.extend(newsamples[sampleid])
        arset = arset.remap_index_to_feature(samples)
        for rule in arset:
            sample_rules.append("((%s)>(%s)(%s))" % (",".join(
                rule.ls), ",".join(rule.rs), rule.attributes["laplace"]))
        f.write("\n".join(sample_rules))
        f.write("\n\n")

    "Do aggregate analysis of broken down rules"
    finished_items = {}
    for rule in indexed_rules:
        for item in rule.ls:
            finished_items[item] = 1
    items = sorted(finished_items.keys())
    f.write("\n\nItems in rules\n\n")
Example #11
		if len(sets[sampleid]) == 1 and sets[sampleid][0] != sampleid:
			continue
		sample_id_list = []
		class_label_counts = {}
		for sid in sets[sampleid]:
			class_label = samples.get_by_id(sid).get_class_label()
			if class_label not in class_label_counts:
				class_label_counts[class_label] = 0
			class_label_counts[class_label] += 1
			sample_id_list.append("%s(%s)"%(sid,class_label))
		for class_label in sorted(class_label_counts.keys()):
			f.write("%s:%d  "%(class_label,class_label_counts[class_label]))
		f.write("%s\n"%(",".join(sample_id_list)))
		
		sample_rules = []
		arset = AssociationRuleSet()
		arset.extend(newsamples[sampleid])
		arset = arset.remap_index_to_feature(samples)
		for rule in arset:
			sample_rules.append("((%s)>(%s)(%s))"%(",".join(rule.ls),",".join(rule.rs),rule.attributes["laplace"]))
		f.write("\n".join(sample_rules))
		f.write("\n\n")
	
	"Do aggregate analysis of broken down rules"
	finished_items = {}
	for rule in indexed_rules:
		for item in rule.ls:
			finished_items[item] = 1
	items = sorted(finished_items.keys())
	f.write("\n\nItems in rules\n\n")
	all_class_labels = {}
Example #12
                if (sample.id == target_sample_id):
                    target_samples.append(sample)
    else:
        print "You must specify a target sample"
        sys.exit(1)

    if len(target_samples) == 0:
        print "Could not find samples!"
        sys.exit()
    samples_time = pt.stop()
    print "Loaded samples (%0.2fs)" % (samples_time)

    pt.start()

    rules = load_rules(options.model_filename)
    rules = rules.remap_feature_to_index(samples)
    training_time = pt.stop()
    newrules = []

    for rule in rules:
        keep_rule = False
        for target_sample in target_samples:
            if target_sample.satisfies(rule.ls):
                keep_rule = True
                break  # one satisfying sample is enough
        if keep_rule:
            newrules.append(rule)
    newruleset = AssociationRuleSet()
    newruleset.extend(newrules)
    newruleset = newruleset.remap_index_to_feature(samples)
    newruleset.write(filename=options.output_filename)
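Finally, the name/index round trip used throughout these examples, sketched with an illustrative filename; remap_feature_to_index appears to convert rule items to integer feature indices for matching against samples, and remap_index_to_feature converts them back for reporting.
    indexed = rules.remap_feature_to_index(samples)  # names -> integer indices
    named = indexed.remap_index_to_feature(samples)  # indices -> names
    named.write(filename="filtered.rules")  # illustrative output path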