def load_confounders(self, confounder_filename):
    """Load confounder metadata from *confounder_filename* via FileIO."""
    # Straight delegation to the FileIO metadata loader.
    return FileIO().load_metadata(confounder_filename)
if __name__ == "__main__":
    # CLI entry point: load a sample set and pick out the requested target sample(s).
    pt = ProgramTimer()
    parser = OptionParser(version="%prog 0.8")
    parser.add_option("-s","--samples_filename",help="Read samples from FILE",metavar="FILE")
    parser.add_option("-m","--model_filename",help="Read rules from FILE",metavar="FILE")
    parser.add_option("-t","--target_sample",help="Set the target SAMPLE for selecting",metavar="SAMPLE")
    parser.add_option("-f","--target_samples_filename",help="Read target samples from filename, one per line")
    parser.add_option("-o","--output_filename",help="Write selected rules to FILE",metavar="FILE")
    parser.add_option("-g","--output_genomes",action="store_true",default=False,help="Output genomes that satisfy rules instead")
    (options, args) = parser.parse_args()
    pt.start()  # time the sample-loading phase
    fileio = FileIO()
    samples = fileio.load_samples(options.samples_filename)
    # Collect the target sample object(s); -t (single id) takes precedence over -f (id file).
    target_samples = []
    if options.target_sample:
        # Single target id given on the command line.
        for sample in samples:
            if (sample.id == options.target_sample):
                target_samples.append(sample)
    elif options.target_samples_filename:
        # One target sample id per line; ids with no matching sample are silently skipped.
        target_sample_ids = [x.strip() for x in open(options.target_samples_filename).readlines()]
        for target_sample_id in target_sample_ids:
            for sample in samples:
                if (sample.id == target_sample_id):
                    target_samples.append(sample)
    else:
        print "You must specify a target sample"
        sys.exit(1)
    # NOTE(review): the script continues beyond this chunk.
def crossvalidate(self):
    """Run r replicates of v-fold cross-validation over self.sample_set.

    For each fold, every configuration in self.test_configurations is
    (optionally) feature-selected, trained, and tested; results are stored
    in self.replicates[replicate][fold] in configuration order.  When
    self.outputFilename is set, per-fold training/test sets and SVM model
    side files are written under "<outputFilename>-Files".
    """
    root_output = self.root_output
    if self.outputFilename != None:
        rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")
        # All per-fold artefacts go into a sibling "-Files" directory.
        outputPath = self.outputFilename+"-Files"
        if not os.path.exists(outputPath):
            os.makedirs(outputPath)
    for replicate in xrange(self.r):
        if MPI_PARALLEL:
            # MPI branch intentionally left empty here (handled elsewhere).
            pass
        else:
            replicate_plus_one = replicate+1
            print "Starting replicate %d"%(replicate_plus_one)
            self.replicates.append([])
            # Shuffle then partition the sample set into v folds.
            self._randomize_sample_set()
            partitions = self._split_sample_set(self.v)
            for i in xrange(self.v):
                self.replicates[replicate].append([])
                training_set, test_set = self._construct_training_and_testing_sets(partitions,i)
                print "Fold %d: training_set: %d, test set: %d"%(i,len(training_set),len(test_set))
                if ( self.target_class != None ):
                    # Persist this fold's training/test split for later inspection.
                    # NOTE(review): outputPath is only defined when
                    # self.outputFilename != None — confirm callers guarantee that.
                    fileio = FileIO()
                    trainingSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_training.set"
                    testSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_test.set"
                    print "Saving training set to: "+trainingSetFile
                    fileio.save_samples(training_set,trainingSetFile)
                    print "Saving test set to: "+testSetFile
                    fileio.save_samples(test_set,testSetFile)
                for test_configuration in self.test_configurations:
                    test_name = test_configuration.name
                    new_training_set = training_set
                    new_test_set = test_set
                    if test_configuration.feature_selector:
                        # Select features on the training fold only, then project both sets.
                        features = test_configuration.feature_selector.select(training_set)
                        new_training_set = training_set.feature_select(features)
                        new_test_set = test_set.feature_select(features)
                    model = test_configuration.trainer.train(new_training_set)
                    if self.outputFilename != None:
                        if not hasattr(model, 'write'): # i.e. probably SVM model
                            # SVM models are dict-like; save the native model plus
                            # pickled label/feature maps as side files.
                            svmModelFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_svm.model"
                            model['svm_model'].save(filename=svmModelFile)
                            with open(svmModelFile+".classlabelmap",'a') as outfile:
                                pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
                            with open(svmModelFile+".classlabelmapindex",'a') as outfile:
                                pickle.dump(model["class_label_map_index"],outfile)
                            with open(svmModelFile+".featuremapindex",'w') as outfile:
                                pickle.dump(new_training_set.get_index_to_feature(), outfile)
                    results = test_configuration.classifier.test(new_test_set,model)
                    self.replicates[replicate][i].append(results) #order of results same as order of configurations
                    if ( self.target_class != None ):
                        print results.print_classification_log()
                    print results
                    if root_output:
                        # NOTE(review): `features` is only bound when a
                        # feature_selector was configured — NameError otherwise.
                        fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"%(locals()),"w")
                        fout.write("\n".join(features))
                        fout.close()
            if root_output:
                # Write one tab-separated classification log per replicate:
                # sample, fold, true class, then one predicted class per configuration.
                fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
                header_fields = ["sample","fold",self.sample_set.current_class]
                for test_configuration in self.test_configurations:
                    header_fields.append(test_configuration.name)
                output_dictionary = {}
                output_lines = ["\t".join(header_fields)]
                for fold in xrange(self.v):
                    for classification_index in xrange(len(self.replicates[replicate][fold][0].classifications_list)):
                        # First configuration's record supplies sample id and true class.
                        main_sample_record = self.replicates[replicate][fold][0].classifications_list[classification_index]
                        output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
                        for test_configuration_index in xrange(len(self.test_configurations)):
                            test_sample_record = self.replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
                            output_line.append(str(test_sample_record.predicted_class))
                        output_lines.append("\t".join(output_line))
                fout.write("\n".join(output_lines))
                fout.close()
            print "Finished replicate %d"%(replicate_plus_one)
# Validate required command-line options; report every missing one, then bail.
errorCount = 0
if not options.input_samples_filename:
    error("Please provide a genotype sample file with -s /path/to/genotype.file")
    errorCount += 1
if not options.input_classes_filename:
    error("Please provide a phenotype class file with -c /path/to/phenotype.file")
    errorCount += 1
if not options.target_class:
    error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
    errorCount += 1
if errorCount > 0:
    error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
    exit(1)
pt.start()  # time the sample-loading phase
fileio = FileIO()
samples = fileio.load_samples(options.input_samples_filename)
samples_time = pt.stop()
print "Loaded samples (%0.2fs)"%(samples_time)
if options.feature_select:
    # Optional pre-filter: keep only the top-N association rules ranked by the
    # chosen score, and project the sample set onto those features.
    print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
    pt.start()
    from pica.AssociationRule import load_rules,AssociationRuleSet
    selected_rules = AssociationRuleSet()
    rules = load_rules(options.feature_select)
    rules.set_target_accuracy(options.feature_select_score)
    selected_rules.extend(rules[:options.feature_select_top_n])
    samples = samples.feature_select(selected_rules)
    print "Finished feature selection (%0.2fs)"%(pt.stop())
classes = fileio.load_classes(options.input_classes_filename)
samples.load_class_labels(classes)
"""Get the mutual information shared between samples in file 1 with class labels in file 2 with taxonomy levels in file 3 and output to file 4""" import sys samples_filename = sys.argv[1] class_labels_filename = sys.argv[2] metadata_filename = sys.argv[3] output_filename = sys.argv[4] from pica.Sample import SampleSet, ClassLabelSet from pica.io.FileIO import FileIO from pica.IntegerMapping import IntegerMapping from pica.trainers.cwmi.CWMILibrary import CWMILibrary fileio = FileIO() cwmilibrary = CWMILibrary() metadata = fileio.load_metadata(metadata_filename) samples = fileio.load_samples(samples_filename) classes = fileio.load_classes(class_labels_filename) samples.load_class_labels(classes) confounders = metadata.get_key_list()[1:] outlines = [] header_line = ["phenotype"] header_line.extend(confounders) header_line.append("total") outlines.append("\t".join(header_line)) for class_name in classes.get_classes():
error("Please provide a genotype sample file with -s /path/to/genotype.file") errorCount += 1 if not options.input_classes_filename: error("Please provide a phenotype class file with -c /path/to/phenotype.file") errorCount += 1 if not options.target_class: error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"") errorCount += 1 if not options.output_filename: error("Please specify a file for the output with -o /path/to/result.file") errorCount += 1 if errorCount > 0: error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0])) exit(1) fileio = FileIO() samples = fileio.load_samples(options.input_samples_filename) if options.feature_select: print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score) from pica.AssociationRule import load_rules,AssociationRuleSet selected_rules = AssociationRuleSet() rules = load_rules(options.feature_select) rules.set_target_accuracy(options.feature_select_score) selected_rules.extend(rules[:options.feature_select_top_n]) samples = samples.feature_select(selected_rules) classes = fileio.load_classes(options.input_classes_filename) samples.load_class_labels(classes) print "Sample set has %d features."%(samples.get_number_of_features()) samples.set_current_class(options.target_class) print "Parameters from %s"%(options.parameters) print "Compressing features...",
# Validate required command-line options; report every missing one, then bail.
if not options.target_class:
    error(
        "Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
    errorCount += 1
if not options.output_filename:
    error(
        "Please specify a file for the output with -o /path/to/result.file")
    errorCount += 1
if errorCount > 0:
    error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
    exit(1)
fileio = FileIO()
# Two copies of the sample set: `unmodified_samples` keeps the full feature
# space while `samples` may be reduced by feature selection below.
unmodified_samples = fileio.load_samples(options.input_samples_filename)
samples = fileio.load_samples(options.input_samples_filename)
if options.feature_select:
    # Optional pre-filter: keep only the top-N association rules ranked by the
    # chosen score, and project `samples` onto those features.
    print "Selecting top %d features from %s, ordered by %s" % (
        options.feature_select_top_n, options.feature_select,
        options.feature_select_score)
    from pica.AssociationRule import load_rules, AssociationRuleSet
    selected_rules = AssociationRuleSet()
    rules = load_rules(options.feature_select)
    rules.set_target_accuracy(options.feature_select_score)
    selected_rules.extend(rules[:options.feature_select_top_n])
    samples = samples.feature_select(selected_rules)
classes = fileio.load_classes(options.input_classes_filename)
unmodified_samples.load_class_labels(classes)
samples.load_class_labels(classes)
"--output_filename", help="Write selected organisms to FILE", metavar="FILE") parser.add_option("-c", "--classes_filename", help="Read class labels from FILE", metavar="FILE") parser.add_option("-t", "--target_class", help="Target class.", metavar="CLASS") (options, args) = parser.parse_args() pt.start() fileio = FileIO() samples = fileio.load_samples(options.samples_filename) classes = fileio.load_classes(options.classes_filename) samples.load_class_labels(classes) samples.set_current_class(options.target_class) target_samples = [] samples_time = pt.stop() print "Loaded samples (%0.2fs)" % (samples_time) pt.start() rules = load_rules(options.model_filename) indexed_rules = rules.remap_feature_to_index(samples) training_time = pt.stop() newsamples = {}
# Check arguments for crucial errors errorCount = 0 if not options.input_samples_filename: error("Please provide a genotype sample file with -s /path/to/genotype.file") errorCount += 1 if not options.model_filename: error("Please provide a model file for this phenotype with -m /path/to/model.file") errorCount += 1 if not options.target_class: error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"") errorCount += 1 if errorCount > 0: error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0])) exit(1) fileio = FileIO() if options.algorithm == "libsvm.libSVMClassifier": # RVF: part of SVM fix (feature-index map) samples = fileio.load_samples(options.input_samples_filename, indexToAttribute=options.model_filename) else: #original code samples = fileio.load_samples(options.input_samples_filename) if options.input_classes_filename: classes = fileio.load_classes(options.input_classes_filename) else: classes = fileio.init_null_classes(options.input_samples_filename, options.target_class) #RVF """rules = load_rules(options.model_filename) #original code""" if options.algorithm == "libsvm.libSVMClassifier": m = svm_model(options.model_filename) with open(options.model_filename+".classlabelmap", 'rb') as handle: clm = pickle.loads(handle.read())
def crossvalidate(self):
    """Run r replicates of v-fold cross-validation, additionally sweeping the
    completeness/contamination grid on every test fold.

    Results are stored as
    self.replicates[replicate][fold][w][z] where w indexes
    self.completeness and z indexes self.contamination.
    """
    root_output = self.root_output
    if self.outputFilename != None:
        rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")
        # All per-fold artefacts go into a sibling "-Files" directory.
        outputPath = self.outputFilename + "-Files"
        if not os.path.exists(outputPath):
            os.makedirs(outputPath)
    for replicate in xrange(self.r):
        if MPI_PARALLEL:
            # MPI branch intentionally left empty here (handled elsewhere).
            pass
        else:
            replicate_plus_one = replicate + 1
            print "Starting replicate %d" % (replicate_plus_one)
            self.replicates.append([])
            # Shuffle, keep the unmodified copy aligned, then split into v folds.
            self._randomize_sample_set()
            self.unmodified = self.unmodified._sort_by_sample_set(
                self.sample_set)
            partitions = self._split_sample_set(self.v)
            for i in xrange(self.v):
                self.replicates[replicate].append([])
                training_set, test_set = self._construct_training_and_testing_sets(
                    partitions, i)
                print "Fold %d: training_set: %d, test set: %d" % (
                    i, len(training_set), len(test_set))
                if (self.target_class != None):
                    # Persist this fold's training/test split for later inspection.
                    # NOTE(review): outputPath is only defined when
                    # self.outputFilename != None — confirm callers guarantee that.
                    fileio = FileIO()
                    trainingSetFile = outputPath + "/" + str(
                        self.target_class) + "_R" + str(
                            replicate) + "_F" + str(i) + "_training.set"
                    testSetFile = outputPath + "/" + str(
                        self.target_class) + "_R" + str(
                            replicate) + "_F" + str(i) + "_test.set"
                    print "Saving training set to: " + trainingSetFile
                    fileio.save_samples(training_set, trainingSetFile)
                    print "Saving test set to: " + testSetFile
                    fileio.save_samples(test_set, testSetFile)
                for test_configuration in self.test_configurations:
                    test_name = test_configuration.name
                    new_training_set = training_set
                    new_test_set = test_set
                    if test_configuration.feature_selector:
                        # Select features on the training fold only, then project both sets.
                        features = test_configuration.feature_selector.select(
                            training_set)
                        new_training_set = training_set.feature_select(
                            features)
                        new_test_set = test_set.feature_select(features)
                    model = test_configuration.trainer.train(
                        new_training_set)
                    if self.outputFilename != None:
                        if not hasattr(model, 'write'):  # i.e. probably SVM model
                            # SVM models are dict-like; save the native model plus
                            # pickled label/feature maps as side files.
                            svmModelFile = outputPath + "/" + str(
                                self.target_class) + "_R" + str(
                                    replicate) + "_F" + str(
                                        i) + "_svm.model"
                            model['svm_model'].save(filename=svmModelFile)
                            with open(svmModelFile + ".classlabelmap",
                                      'a') as outfile:
                                pickle.dump(
                                    model["class_label_map"], outfile
                                )  #fails with model, because of SWIGpy object
                            with open(svmModelFile + ".classlabelmapindex",
                                      'a') as outfile:
                                pickle.dump(model["class_label_map_index"],
                                            outfile)
                            with open(svmModelFile + ".featuremapindex",
                                      'w') as outfile:
                                pickle.dump(
                                    new_training_set.get_index_to_feature(
                                    ), outfile)
                    #####################################################################################
                    # add here contamination&completeness
                    #####################################################################################
                    # Bucket test-sample attribute index lists by class label;
                    # used as the pool for cross-contamination below.
                    all_class_labels = new_test_set.get_class_labels()
                    sample_attribute_collection = {}
                    for index in all_class_labels:
                        sample_attribute_collection[index] = []
                    for sample in new_test_set.__iter__():
                        temp_attributes_list = list(
                            sample.get_attributes_index_list())
                        sample_attribute_collection[
                            sample.current_class_label].append(
                                temp_attributes_list)
                    for w in range(0, len(self.completeness)):
                        self.replicates[replicate][i].append([])
                        # Degrade the test set to the target completeness level.
                        incomplete_test_set = new_test_set.induce_incompleteness(
                            self.completeness[w])
                        if len(sample_attribute_collection.keys()) != 2:
                            # Cross-contamination needs exactly two class labels;
                            # record empty result slots and skip this w level.
                            print(sample_attribute_collection.keys())
                            sys.stderr.write(
                                "Warning: skipping contamination of Fold %i in replicate %i: need exactly 2 different class labels\n"
                                % (i, replicate))
                            for z in range(0, len(self.contamination)):
                                self.replicates[replicate][i][w].append([])
                            continue
                        for z in range(0, len(self.contamination)):
                            self.replicates[replicate][i][w].append([])
                            #print(completeness,contamination)
                            # Contaminate with attributes from the other class,
                            # then remap attributes onto the training feature space.
                            contaminated_test_set = incomplete_test_set.introduce_contamination(
                                sample_attribute_collection,
                                self.contamination[z])
                            contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(
                                new_training_set)
                            #print(dir(model))
                            results = test_configuration.classifier.test(
                                contaminated_test_set, model)
                            self.replicates[replicate][i][w][z].append(
                                results
                            )  #order of results same as order of configurations
                            if (self.target_class != None):
                                print results.print_classification_log()
                            print results
                            if root_output:
                                # NOTE(review): `features` is only bound when a
                                # feature_selector was configured — NameError otherwise.
                                fout = open(
                                    "%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"
                                    % (locals()), "w")
                                fout.write("\n".join(features))
                                fout.close()
            if root_output:
                # Write one tab-separated classification log per replicate:
                # sample, fold, true class, then one predicted class per configuration.
                # NOTE(review): this indexes replicates[replicate][fold][k] as if the
                # fold entry were a flat list of results, but above it is nested as
                # [w][z] — confirm this log writer matches the new structure.
                fout = open(
                    "%(root_output)s.r%(replicate_plus_one)d.classification.log"
                    % (locals()), "w")
                header_fields = [
                    "sample", "fold", self.sample_set.current_class
                ]
                for test_configuration in self.test_configurations:
                    header_fields.append(test_configuration.name)
                output_dictionary = {}
                output_lines = ["\t".join(header_fields)]
                for fold in xrange(self.v):
                    for classification_index in xrange(
                            len(self.replicates[replicate][fold]
                                [0].classifications_list)):
                        main_sample_record = self.replicates[replicate][
                            fold][0].classifications_list[
                                classification_index]
                        output_line = [
                            str(main_sample_record.who),
                            str(fold + 1),
                            str(main_sample_record.true_class)
                        ]
                        for test_configuration_index in xrange(
                                len(self.test_configurations)):
                            test_sample_record = self.replicates[
                                replicate][fold][
                                    test_configuration_index].classifications_list[
                                        classification_index]
                            output_line.append(
                                str(test_sample_record.predicted_class))
                        output_lines.append("\t".join(output_line))
                fout.write("\n".join(output_lines))
                fout.close()
            print "Finished replicate %d" % (replicate_plus_one)
"""Get the mutual information shared between samples in file 1 with class labels in file 2 with taxonomy levels in file 3 and output to file 4""" import sys samples_filename = sys.argv[1] class_labels_filename = sys.argv[2] metadata_filename = sys.argv[3] output_filename = sys.argv[4] from pica.Sample import SampleSet, ClassLabelSet from pica.io.FileIO import FileIO from pica.IntegerMapping import IntegerMapping from pica.trainers.cwmi.CWMILibrary import CWMILibrary fileio = FileIO() cwmilibrary = CWMILibrary() metadata = fileio.load_metadata(metadata_filename) samples = fileio.load_samples(samples_filename) classes = fileio.load_classes(class_labels_filename) samples.load_class_labels(classes) confounders = metadata.get_key_list()[1:] outlines = [] header_line = ["phenotype"] header_line.extend(confounders) header_line.append("total") outlines.append("\t".join(header_line)) for class_name in classes.get_classes(): "generate phenotype map"
# Validate required command-line options; report every missing one, then bail.
if not options.model_filename:
    error(
        "Please provide a model file for this phenotype with -m /path/to/model.file")
    errorCount += 1
if not options.target_class:
    error(
        "Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
    errorCount += 1
if errorCount > 0:
    error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
    exit(1)
fileio = FileIO()
if options.algorithm == "libsvm.libSVMClassifier":  # RVF: part of SVM fix (feature-index map)
    # SVM models need the training-time feature->index map to load consistently.
    samples = fileio.load_samples(options.input_samples_filename,
                                  indexToAttribute=options.model_filename)
else:  #original code
    samples = fileio.load_samples(options.input_samples_filename)
if options.input_classes_filename:
    classes = fileio.load_classes(options.input_classes_filename)
else:
    # No class file given: build a null/placeholder class set for the target.
    classes = fileio.init_null_classes(options.input_samples_filename, options.target_class)  #RVF
"""rules = load_rules(options.model_filename) #original code"""
if options.algorithm == "libsvm.libSVMClassifier":
    m = svm_model(options.model_filename)
if __name__ == "__main__":
    # CLI entry point: load samples + class labels, then apply a rule model.
    pt = ProgramTimer()
    parser = OptionParser(version="%prog 0.8")
    parser.add_option("-s","--samples_filename",help="Read samples from FILE",metavar="FILE")
    parser.add_option("-m","--model_filename",help="Read rules from FILE",metavar="FILE")
    parser.add_option("-o","--output_filename",help="Write selected organisms to FILE",metavar="FILE")
    parser.add_option("-c","--classes_filename",help="Read class labels from FILE",metavar="FILE")
    parser.add_option("-t","--target_class",help="Target class.",metavar="CLASS")
    (options, args) = parser.parse_args()
    pt.start()  # time the sample-loading phase
    fileio = FileIO()
    samples = fileio.load_samples(options.samples_filename)
    classes = fileio.load_classes(options.classes_filename)
    samples.load_class_labels(classes)
    samples.set_current_class(options.target_class)
    target_samples = []
    samples_time = pt.stop()
    print "Loaded samples (%0.2fs)"%(samples_time)
    pt.start()  # time rule loading / remapping
    # Load the rule model and remap its feature names onto this sample set's indices.
    rules = load_rules(options.model_filename)
    indexed_rules = rules.remap_feature_to_index(samples)
    training_time = pt.stop()
    newsamples = {}
    # NOTE(review): the script continues beyond this chunk.
def replicateProcess(parametertuple):
    """Worker for one (replicate, fold) of the completeness/contamination CV.

    parametertuple unpacks to: training_set, test_set, target_class,
    test_configurations, outputFilename, completeness, contamination,
    root_output, replicate, fold.  Returns output[w] -> list of
    ClassificationSummary objects, indexed over the completeness levels (w)
    with one entry appended per contamination level (z) actually run.
    """
    training_set, test_set, target_class, test_configurations, outputFilename, completeness, contamination, root_output, replicate, fold = parametertuple
    w_tot = len(completeness)
    z_tot = len(contamination)
    output=[]
    replicate_plus_one=replicate+1
    print "Fold %d: training_set: %d, test set: %d"%(fold,len(training_set),len(test_set))
    if ( target_class != None ):
        # Persist this fold's training/test split for later inspection.
        # NOTE(review): `outputPath` is never defined inside this function
        # (unlike crossvalidate, which derives it from outputFilename) —
        # this branch raises NameError as written; confirm intended source.
        fileio = FileIO()
        trainingSetFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_training.set"
        testSetFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_test.set"
        print "Saving training set to: "+trainingSetFile
        fileio.save_samples(training_set,trainingSetFile)
        print "Saving test set to: "+testSetFile
        fileio.save_samples(test_set,testSetFile)
    for test_configuration_index in xrange(len(test_configurations)):
        test_configuration=test_configurations[test_configuration_index]
        test_name = test_configuration.name
        new_training_set = training_set
        new_test_set = test_set
        if test_configuration.feature_selector:
            # Select features on the training fold only, then project both sets.
            features = test_configuration.feature_selector.select(training_set)
            new_training_set = training_set.feature_select(features)
            new_test_set = test_set.feature_select(features)
        model = test_configuration.trainer.train(new_training_set)
        if outputFilename != None:
            if not hasattr(model, 'write'): # i.e. probably SVM model
                # SVM models are dict-like; save the native model plus pickled
                # label/feature maps as side files (see NameError note above re outputPath).
                svmModelFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_svm.model"
                model['svm_model'].save(filename=svmModelFile)
                with open(svmModelFile+".classlabelmap",'a') as outfile:
                    pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
                with open(svmModelFile+".classlabelmapindex",'a') as outfile:
                    pickle.dump(model["class_label_map_index"],outfile)
                with open(svmModelFile+".featuremapindex",'w') as outfile:
                    pickle.dump(new_training_set.get_index_to_feature(), outfile)
        # Bucket test-sample attribute index lists by class label; used as the
        # pool for cross-contamination below.
        all_class_labels=new_test_set.get_class_labels()
        sample_attribute_collection={}
        for index in all_class_labels:
            sample_attribute_collection[index]=[]
        for sample in new_test_set.__iter__():
            temp_attributes_list=sample.get_attributes_index_list()
            sample_attribute_collection[sample.current_class_label].append(temp_attributes_list)
        for w in xrange(w_tot):
            output.append([])
            # Degrade the test set to the target completeness level.
            incomplete_test_set = new_test_set.induce_incompleteness(completeness[w])
            err=0  # warn at most once per completeness level
            for z in xrange(z_tot):
                #output[w][z].append([])
                if round(contamination[z],1) == 0.0:
                    # Zero contamination: test the merely-incomplete set.
                    results = test_configuration.classifier.test(incomplete_test_set.map_test_set_attributes_to_training_set(new_training_set),model)
                    summary = ClassificationSummary(results)
                    output[w].append(summary)
                elif len(sample_attribute_collection.keys())==2: #do crosscontamination if exactly 2 class labels given
                    contaminated_test_set = incomplete_test_set.introduce_contamination(sample_attribute_collection,contamination[z])
                    contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(new_training_set)
                    results = test_configuration.classifier.test(contaminated_test_set,model)
                    summary = ClassificationSummary(results)
                    output[w].append(summary)
                    if ( target_class != None ):
                        print results.print_classification_log()
                    print results
                    if root_output:
                        # NOTE(review): `features` is only bound when a
                        # feature_selector was configured — NameError otherwise.
                        fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(fold)d.%(test_name)s.features"%(locals()),"w")
                        fout.write("\n".join(features))
                        fout.close()
                elif err==0:
                    # Cross-contamination needs exactly two class labels.
                    sys.stderr.write("Warning: skipping contamination of fold %i of replicate %i: exactly 2 different class labels needed!"%(fold,replicate))
                    err=1
                    #print(replicates[replicate][fold][w][z])
    # if root_output:
    #     fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
    #     header_fields = ["sample","fold",sample_set.current_class]
    #     for test_configuration in test_configurations:
    #         header_fields.append(test_configuration.name)
    #     output_dictionary = {}
    #     output_lines = ["\t".join(header_fields)]
    #     for fold in xrange(v):
    #         for classification_index in xrange(len(replicates[replicate][fold][0].classifications_list)):
    #             main_sample_record = replicates[replicate][fold][0].classifications_list[classification_index]
    #             output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
    #             for test_configuration_index in xrange(len(test_configurations)):
    #                 test_sample_record = replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
    #                 output_line.append(str(test_sample_record.predicted_class))
    #             output_lines.append("\t".join(output_line))
    #     fout.write("\n".join(output_lines))
    #     fout.close()
    print "Finished replicate %d, fold %d"%(replicate_plus_one,fold)
    return output