def train(self, sample_set):
    """Score every feature against each confounder and return an AssociationRuleSet."""
    cwmilibrary = CWMILibrary()
    cwmilibrary.confounders = self.confounders
    ardic = {}
    nattributes = sample_set.get_number_of_features()
    confounders = ["genus", "family", "order", "class", "phylum", "superkingdom"]
    # Note: the full taxonomic list above is immediately overridden; only
    # "order" is actually used as a confounder here.
    confounders = ["order"]
    cwmilibrary.P_Y = None
    for confounder in confounders:
        # Reset the cached confounder distributions for each new confounder.
        cwmilibrary.P_Y_Z = None
        cwmilibrary.P_Z = None
        for i in xrange(nattributes):
            if i % 100 == 0:
                print "Processed %d of %d" % (i, nattributes)
            feature = i
            feature_scores = {}
            scores = cwmilibrary._calculate_information_scores(sample_set, feature, confounder)
            feature_scores["%s_mi" % (confounder)] = scores["mi"]
            feature_scores["%s_cmi" % (confounder)] = scores["cmi"]
            # CWMI = (CMI * MI) / H(Y|Z); guard against a zero conditional entropy.
            cwmi = 0.0
            if scores["hygivenz"] > 0:
                cwmi = float(scores["cmi"] * scores["mi"]) / scores["hygivenz"]
            feature_scores["%s_cwmi" % (confounder)] = cwmi
            if not ardic.has_key(feature):
                ardic[feature] = {}
            ardic[feature].update(feature_scores)
    association_rule_set = AssociationRuleSet()
    arlist = []
    for key in ardic.keys():
        class_label = self.get_best_class([key], sample_set)
        arlist.append(AssociationRule([key], [class_label], ardic[key]))
    association_rule_set.extend(arlist)
    association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
    return association_rule_set
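# A minimal standalone restatement of the CWMI score computed above, assuming
# only that `scores` carries "mi", "cmi", and "hygivenz" (H(Y|Z)) as floats;
# the helper name combine_cwmi is illustrative and not part of CWMILibrary.
def combine_cwmi(scores):
    """Return (CMI * MI) / H(Y|Z), or 0.0 when the conditional entropy is zero."""
    if scores["hygivenz"] <= 0:
        return 0.0
    return float(scores["cmi"] * scores["mi"]) / scores["hygivenz"]

assert abs(combine_cwmi({"mi": 0.5, "cmi": 0.4, "hygivenz": 0.8}) - 0.25) < 1e-12
assert combine_cwmi({"mi": 0.5, "cmi": 0.4, "hygivenz": 0.0}) == 0.0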
def train(self, samples, maxRuleSize=9999, mineOnlyClass=None):
    """Train with CPAR on the sample set, returning an AssociationRuleSet."""
    self.MAX_RULE_SIZE = maxRuleSize
    self.objPNData = PNData(samples)
    self.lstRules = []
    classes = self.objPNData.getClassList()
    log("Dataset has %d classes over %d samples." % (len(classes), len(samples)))
    for current_class in classes:
        if mineOnlyClass is not None and current_class != mineOnlyClass:
            continue
        log("Processing class %s" % (current_class))
        self.objPNData.setCurrentClass(current_class)
        dblMinTotalWeight = self.dblTotalWeightFactor * self.objPNData.getTotalWeight()
        lstAntecedent = []
        # Grow rules depth-first until the remaining total weight falls below the threshold.
        while self.objPNData.getTotalWeight() > dblMinTotalWeight:
            self.objPNData.refreshPNAData()
            if self.objPNData.noValidGainsinPNarray(self.dblMinGainThreshold):
                #log("NO VALID GAINS....Breaking!")
                break
            #log('BEGIN DEPTH FIRST SEARCH - total weight %f > %f' % (self.objPNData.getTotalWeight(), dblMinTotalWeight))
            self._CPARdfs(self.objPNData.copyPrimes(), lstAntecedent, [current_class])
    trules = len(self.lstRules)
    self.removeDuplicateRules()
    #log("End of rule search. Found %d rules total, %d after duplicates removed." % (trules, len(self.lstRules)))
    arset = AssociationRuleSet()
    arset.extend(self.lstRules)
    arset.set_target_accuracy("laplace")
    return self.remap_index_to_feature(arset, samples)
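# A hypothetical usage sketch for the CPAR trainer above. The FileIO loading
# calls and ruleset.write mirror the scripts later in this section, but the
# class name CPARTrainer, its no-argument constructor, and the file names are
# assumptions for illustration only.
fileio = FileIO()
samples = fileio.load_samples("samples.txt")
samples.load_class_labels(fileio.load_classes("classes.txt"))
trainer = CPARTrainer()
ruleset = trainer.train(samples, maxRuleSize=4, mineOnlyClass="YES")
ruleset.write(filename="cpar.rules")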
def select(self, sample_set):
    cwmilibrary = CWMILibrary()
    cwmilibrary.confounders = self.confounders
    ardic = {}
    nattributes = sample_set.get_number_of_features()
    confounder = self.confounder
    cwmilibrary.P_Y = None
    cwmilibrary.P_Y_Z = None
    cwmilibrary.P_Z = None
    for i in xrange(nattributes):
        feature = i
        feature_scores = {}
        scores = cwmilibrary._calculate_information_scores(sample_set, feature, confounder)
        feature_scores["mi"] = scores["mi"]
        feature_scores["cmi"] = scores["cmi"]
        # CWMI = (CMI * MI) / H(Y|Z); guard against a zero conditional entropy.
        cwmi = 0.0
        if scores["hygivenz"] > 0:
            cwmi = float(scores["cmi"] * scores["mi"]) / scores["hygivenz"]
        feature_scores["cwmi"] = cwmi
        if not ardic.has_key(feature):
            ardic[feature] = {}
        ardic[feature].update(feature_scores)
    association_rule_set = AssociationRuleSet()
    arlist = []
    for key in ardic.keys():
        class_labels = self.get_best_classes([key], sample_set)
        class_label = "NULL"
        if len(class_labels) > 0:
            class_label = class_labels[0]
        arlist.append(AssociationRule([key], [class_label], ardic[key]))
    association_rule_set.extend(arlist)
    class_labels = self._get_class_labels(sample_set)
    association_rule_set = association_rule_set.remap_index_to_feature(sample_set)
    "Make one copy of the rule set per score, then alternate between them until the quota is filled..."
    association_rule_sets = []
    for score in self.scores:
        aset = deepcopy(association_rule_set)
        aset.set_target_accuracy(score)
        association_rule_sets.append(aset)
    used_features = {}
    features = []
    features_to_select = self.features_per_class * len(self.scores) * len(class_labels)
    feature_class_counts = {}
    for score in self.scores:
        feature_class_counts[score] = {}
        for class_label in class_labels:
            feature_class_counts[score][class_label] = self.features_per_class
    print "Processing features (%d)" % (features_to_select)
    while features_to_select > 0:
        selected_this_pass = 0
        for score_index in xrange(len(self.scores)):
            score = self.scores[score_index]
            association_rule_set = association_rule_sets[score_index]
            "Pick the next unused rule for each class..."
            for class_label in class_labels:
                for rule_index in xrange(len(association_rule_set)):
                    rule = association_rule_set[rule_index]
                    if rule.rs[0] == class_label:
                        if not used_features.has_key(rule.ls[0]):
                            used_features[rule.ls[0]] = 1
                            feature_class_counts[score][rule.rs[0]] -= 1
                            features.append(rule.ls[0])
                            features_to_select -= 1
                            selected_this_pass += 1
                            break
        # Guard against looping forever when the rule sets cannot supply enough unused features.
        if selected_this_pass == 0:
            break
    print "Finished processing for %s, found %d features" % (str(self.scores), len(features))
    if len(features) != self.features_per_class * len(self.scores) * len(class_labels):
        print "ERROR! did not find enough features...%d instead of %d" % (len(features), self.features_per_class * len(self.scores) * len(class_labels))
    return features
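# The selection loop above interleaves scores and classes round-robin, taking
# the best unused feature for each (score, class) pair per pass. A stripped-down
# sketch of that scheduling idea on plain lists; all names here are illustrative,
# and `ranked` maps a (score, class) pair to its ranked feature list.
def round_robin_select(ranked, per_class):
    used, picked = set(), []
    for _ in xrange(per_class):
        for key in sorted(ranked.keys()):
            for feature in ranked[key]:
                if feature not in used:
                    used.add(feature)
                    picked.append(feature)
                    break
    return picked

ranked = {("mi", "YES"): [3, 1, 2], ("mi", "NO"): [1, 4]}
print round_robin_select(ranked, per_class=2)  # [1, 3, 4, 2]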
error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0])) exit(1) pt.start() fileio = FileIO() samples = fileio.load_samples(options.input_samples_filename) samples_time = pt.stop() print "Loaded samples (%0.2fs)"%(samples_time) if options.feature_select: print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score) pt.start() from pica.AssociationRule import load_rules,AssociationRuleSet selected_rules = AssociationRuleSet() rules = load_rules(options.feature_select) rules.set_target_accuracy(options.feature_select_score) selected_rules.extend(rules[:options.feature_select_top_n]) samples = samples.feature_select(selected_rules) print "Finished feature selection (%0.2fs)"%(pt.stop()) classes = fileio.load_classes(options.input_classes_filename) samples.load_class_labels(classes) print samples.get_number_of_features() samples.set_current_class(options.target_class) pt.start() print "Compressing features...", samples = samples.compress_features() compression_time = pt.stop() print "\bfinished compression.(%0.2fs)"%(compression_time) samples.set_current_class(options.target_class)
if not options.output_filename:
    error("Please specify a file for the output with -o /path/to/result.file")
    errorCount += 1
if errorCount > 0:
    error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
    exit(1)

fileio = FileIO()
samples = fileio.load_samples(options.input_samples_filename)

if options.feature_select:
    print "Selecting top %d features from %s, ordered by %s" % (options.feature_select_top_n, options.feature_select, options.feature_select_score)
    from pica.AssociationRule import load_rules, AssociationRuleSet
    selected_rules = AssociationRuleSet()
    rules = load_rules(options.feature_select)
    rules.set_target_accuracy(options.feature_select_score)
    selected_rules.extend(rules[:options.feature_select_top_n])
    samples = samples.feature_select(selected_rules)

classes = fileio.load_classes(options.input_classes_filename)
samples.load_class_labels(classes)
print "Sample set has %d features." % (samples.get_number_of_features())
samples.set_current_class(options.target_class)
print "Parameters from %s" % (options.parameters)
print "Compressing features...",
samples = samples.compress_features()
print "compressed to %d distinct features." % (samples.get_number_of_features())
samples.set_current_class(options.target_class)
samples.hide_nulls(options.target_class)

modulepath = "pica.trainers.%s" % (options.training_algorithm)
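# The modulepath above is presumably consumed by a dynamic import. A common
# Python 2 pattern for that step is shown below; it is an assumption, not
# necessarily the original code, and it further assumes the trainer class is
# named after its module.
module = __import__(modulepath, fromlist=[options.training_algorithm])
trainer_class = getattr(module, options.training_algorithm)
trainer = trainer_class()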
        continue
    sample_id_list = []
    class_label_counts = {}
    for sid in sets[sampleid]:
        class_label = samples.get_by_id(sid).get_class_label()
        if not class_label_counts.has_key(class_label):
            class_label_counts[class_label] = 0
        class_label_counts[class_label] += 1
        sample_id_list.append("%s(%s)" % (sid, class_label))
    for class_label in sorted(class_label_counts.keys()):
        f.write("%s:%d " % (class_label, class_label_counts[class_label]))
    f.write("%s\n" % (",".join(sample_id_list)))
    sample_rules = []
    arset = AssociationRuleSet()
    arset.extend(newsamples[sampleid])
    arset = arset.remap_index_to_feature(samples)
    for rule in arset:
        sample_rules.append("((%s)>(%s)(%s))" % (",".join(rule.ls), ",".join(rule.rs), rule.attributes["laplace"]))
    f.write("\n".join(sample_rules))
    f.write("\n\n")

"Do aggregate analysis of the broken-down rules"
finished_items = {}
for rule in indexed_rules:
    for item in rule.ls:
        finished_items[item] = 1
items = sorted(finished_items.keys())
f.write("\n\nItems in rules\n\n")
all_class_labels = {}
for sample in samples:
        if (sample.id == target_sample_id):
            target_samples.append(sample)
else:
    print "You must specify a target sample"
    sys.exit(1)

if len(target_samples) == 0:
    print "Could not find samples!"
    sys.exit(1)
samples_time = pt.stop()
print "Loaded samples (%0.2fs)" % (samples_time)

pt.start()
rules = load_rules(options.model_filename)
rules = rules.remap_feature_to_index(samples)
training_time = pt.stop()

# Keep only rules whose antecedent is satisfied by at least one target sample.
newrules = []
for rule in rules:
    keep_rule = False
    for target_sample in target_samples:
        if target_sample.satisfies(rule.ls):
            keep_rule = True
            break
    if keep_rule:
        newrules.append(rule)

newruleset = AssociationRuleSet()
newruleset.extend(newrules)
newruleset = newruleset.remap_index_to_feature(samples)
newruleset.write(filename=options.output_filename)
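# Sample.satisfies(rule.ls) is used above as an all-items-present test on the
# rule antecedent. A minimal stand-in with the semantics assumed here; the
# function below is illustrative, not the Sample implementation.
def satisfies(sample_features, antecedent):
    """True when every antecedent item occurs among the sample's features."""
    return all(item in sample_features for item in antecedent)

print satisfies(set(["gene_a", "gene_b"]), ["gene_a"])            # True
print satisfies(set(["gene_a", "gene_b"]), ["gene_a", "gene_c"])  # False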