Ejemplo n.º 1
0
	def load_confounders(self, confounder_filename):
		"""Load confounder metadata from *confounder_filename*.

		Thin wrapper that delegates to FileIO.load_metadata and returns
		whatever metadata object it produces.
		"""
		return FileIO().load_metadata(confounder_filename)
Ejemplo n.º 2
0
# Command-line entry point: load a sample set and collect the target
# sample(s) named via -t (single sample id) or -f (file of ids, one per line).
# NOTE(review): this excerpt is truncated; the selection/output logic that
# consumes target_samples continues beyond what is visible here.
if __name__ == "__main__":
	pt = ProgramTimer()
	parser = OptionParser(version="%prog 0.8")
	parser.add_option("-s","--samples_filename",help="Read samples from FILE",metavar="FILE")
	parser.add_option("-m","--model_filename",help="Read rules from FILE",metavar="FILE")
	parser.add_option("-t","--target_sample",help="Set the target SAMPLE for selecting",metavar="SAMPLE")
	parser.add_option("-f","--target_samples_filename",help="Read target samples from filename, one per line")
	parser.add_option("-o","--output_filename",help="Write selected rules to FILE",metavar="FILE")
	parser.add_option("-g","--output_genomes",action="store_true",default=False,help="Output genomes that satisfy rules instead")
	
	(options, args) = parser.parse_args()

	
	pt.start()
	fileio = FileIO()
	samples = fileio.load_samples(options.samples_filename)
	target_samples = []
	if options.target_sample:
		# Single target: keep every loaded sample whose id matches exactly.
		for sample in samples:
			if (sample.id == options.target_sample):
				target_samples.append(sample)
	elif options.target_samples_filename:
		# Multiple targets: read ids (one per line) and match each against the
		# loaded samples. O(ids * samples) linear scan; fine for small inputs.
		target_sample_ids = [x.strip() for x in open(options.target_samples_filename).readlines()]
		for target_sample_id in target_sample_ids:
			for sample in samples:
				if (sample.id == target_sample_id):
					target_samples.append(sample)
	else:
		# Neither -t nor -f given: a target is mandatory.
		print "You must specify a target sample"
		sys.exit(1)
Ejemplo n.º 3
0
	def crossvalidate(self):
		"""Run self.r replicates of self.v-fold cross-validation.

		Each replicate shuffles the sample set, splits it into self.v
		partitions, trains every test configuration on each fold, and stores
		per-fold results in self.replicates[replicate][fold] (same order as
		self.test_configurations).
		"""
		root_output = self.root_output
		
		# Create a sibling "<outputFilename>-Files" directory for fold dumps.
		if self.outputFilename != None:
			rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")  # NOTE(review): assigned but never used in this method
			outputPath = self.outputFilename+"-Files"
			if not os.path.exists(outputPath):
				os.makedirs(outputPath)
		

		for replicate in xrange(self.r):
			if MPI_PARALLEL:
				# NOTE(review): MPI branch is a stub; parallel mode does nothing here.
				pass
			else:
				replicate_plus_one = replicate+1
				print "Starting replicate %d"%(replicate_plus_one)
				self.replicates.append([])
				self._randomize_sample_set()
				partitions = self._split_sample_set(self.v)
				for i in xrange(self.v):
					self.replicates[replicate].append([])
					training_set, test_set = self._construct_training_and_testing_sets(partitions,i)
					print "Fold %d: training_set: %d, test set: %d"%(i,len(training_set),len(test_set))
					
					# Persist this fold's train/test split.
					# NOTE(review): outputPath is only bound when
					# self.outputFilename != None; if target_class is set but
					# outputFilename is not, this raises NameError.
					if ( self.target_class != None ):
						fileio = FileIO()
						trainingSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_training.set"
						testSetFile =     outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_test.set"
						print "Saving training set to: "+trainingSetFile
						fileio.save_samples(training_set,trainingSetFile)
						print "Saving test set to: "+testSetFile
						fileio.save_samples(test_set,testSetFile)
		
					for test_configuration in self.test_configurations:
						test_name = test_configuration.name
						new_training_set = training_set
						new_test_set = test_set
						# Optional per-configuration feature selection, fitted
						# on the training fold only.
						if test_configuration.feature_selector:
							features = test_configuration.feature_selector.select(training_set)
							new_training_set = training_set.feature_select(features)
							new_test_set = test_set.feature_select(features)
						model = test_configuration.trainer.train(new_training_set)
						
						if self.outputFilename != None:
							if not hasattr(model, 'write'): # i.e. probably SVM model
								svmModelFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_svm.model"
								model['svm_model'].save(filename=svmModelFile)
								# 'a' append mode: repeated runs keep appending
								# pickles to the same side-car files.
								with open(svmModelFile+".classlabelmap",'a') as outfile:
									pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
								with open(svmModelFile+".classlabelmapindex",'a') as outfile:
									pickle.dump(model["class_label_map_index"],outfile)
								with open(svmModelFile+".featuremapindex",'w') as outfile:
									pickle.dump(new_training_set.get_index_to_feature(), outfile)
						
						results = test_configuration.classifier.test(new_test_set,model)
						self.replicates[replicate][i].append(results) #order of results same as order of configurations
						
						if ( self.target_class != None ):
							print results.print_classification_log()
							print results
						
						# The %(name)s keys below are filled from locals(), so
						# the local variable names above are load-bearing.
						# NOTE(review): 'features' is only bound when a
						# feature_selector ran; NameError otherwise.
						if root_output:
							fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"%(locals()),"w")
							fout.write("\n".join(features))
							fout.close()
				if root_output:
					
					# Per-replicate classification log: sample id, fold number,
					# true class, then one predicted-class column per test
					# configuration (same order as configured).
					fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
					header_fields = ["sample","fold",self.sample_set.current_class]
					for test_configuration in self.test_configurations:
						header_fields.append(test_configuration.name)
					output_dictionary = {}
					output_lines = ["\t".join(header_fields)]
					for fold in xrange(self.v):
						for classification_index in xrange(len(self.replicates[replicate][fold][0].classifications_list)):
							main_sample_record = self.replicates[replicate][fold][0].classifications_list[classification_index]
							output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
							for test_configuration_index in xrange(len(self.test_configurations)):
								test_sample_record = self.replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
								output_line.append(str(test_sample_record.predicted_class))
							output_lines.append("\t".join(output_line))
					fout.write("\n".join(output_lines))
					fout.close()
				print "Finished replicate %d"%(replicate_plus_one)
Ejemplo n.º 4
0
	# Validate the required CLI options before doing any work; report every
	# missing argument, then bail out once.
	errorCount = 0
	if not options.input_samples_filename:
		error("Please provide a genotype sample file with -s /path/to/genotype.file")
		errorCount += 1
	if not options.input_classes_filename:
		error("Please provide a phenotype class file with -c /path/to/phenotype.file")
		errorCount += 1
	if not options.target_class:
		error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
		errorCount += 1
	if errorCount > 0:
		error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
		exit(1)
	
	# Load the genotype samples, timing the load with the ProgramTimer.
	pt.start()
	fileio = FileIO()
	samples = fileio.load_samples(options.input_samples_filename)
	samples_time = pt.stop()
	print "Loaded samples (%0.2fs)"%(samples_time)
	# Optional feature selection: keep only the top-N association rules,
	# ranked by the requested score, and project the samples onto them.
	if options.feature_select:
		print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
		pt.start()
		from pica.AssociationRule import load_rules,AssociationRuleSet
		selected_rules = AssociationRuleSet()
		rules = load_rules(options.feature_select)
		rules.set_target_accuracy(options.feature_select_score)
		selected_rules.extend(rules[:options.feature_select_top_n])
		samples = samples.feature_select(selected_rules)
		print "Finished feature selection (%0.2fs)"%(pt.stop())
	classes = fileio.load_classes(options.input_classes_filename)
	samples.load_class_labels(classes)
Ejemplo n.º 5
0
"""Get the mutual information shared between samples in file 1 with class labels in file 2 with taxonomy levels in file 3 and output to file 4"""

import sys


samples_filename = sys.argv[1]
class_labels_filename = sys.argv[2]
metadata_filename = sys.argv[3]
output_filename = sys.argv[4]

from pica.Sample import SampleSet, ClassLabelSet
from pica.io.FileIO import FileIO
from pica.IntegerMapping import IntegerMapping
from pica.trainers.cwmi.CWMILibrary import CWMILibrary

fileio = FileIO()
cwmilibrary = CWMILibrary()
metadata = fileio.load_metadata(metadata_filename)
samples = fileio.load_samples(samples_filename)
classes = fileio.load_classes(class_labels_filename)
samples.load_class_labels(classes)
confounders = metadata.get_key_list()[1:]

outlines = []
header_line = ["phenotype"]

header_line.extend(confounders)
header_line.append("total")
outlines.append("\t".join(header_line))

for class_name in classes.get_classes():
Ejemplo n.º 6
0
		error("Please provide a genotype sample file with -s /path/to/genotype.file")
		errorCount += 1
	if not options.input_classes_filename:
		error("Please provide a phenotype class file with -c /path/to/phenotype.file")
		errorCount += 1
	if not options.target_class:
		error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
		errorCount += 1
	if not options.output_filename:
		error("Please specify a file for the output with -o /path/to/result.file")
		errorCount += 1
	if errorCount > 0:
		error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
		exit(1)
		
	fileio = FileIO()
	samples = fileio.load_samples(options.input_samples_filename)
	if options.feature_select:
		print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
		from pica.AssociationRule import load_rules,AssociationRuleSet
		selected_rules = AssociationRuleSet()
		rules = load_rules(options.feature_select)
		rules.set_target_accuracy(options.feature_select_score)
		selected_rules.extend(rules[:options.feature_select_top_n])
		samples = samples.feature_select(selected_rules)
	classes = fileio.load_classes(options.input_classes_filename)
	samples.load_class_labels(classes)
	print "Sample set has %d features."%(samples.get_number_of_features())
	samples.set_current_class(options.target_class)
	print "Parameters from %s"%(options.parameters)
	print "Compressing features...",
Ejemplo n.º 7
0
    # Validate the remaining required CLI options; report every missing
    # argument, then bail out once.
    if not options.target_class:
        error(
            "Please provide the phenotype target to be predicted with -t \"TRAITNAME\""
        )
        errorCount += 1
    if not options.output_filename:
        error(
            "Please specify a file for the output with -o /path/to/result.file"
        )
        errorCount += 1
    if errorCount > 0:
        error("For help on usage, try calling:\n\tpython %s -h" %
              os.path.basename(sys.argv[0]))
        exit(1)

    # Two copies of the sample set: 'unmodified_samples' keeps the full
    # feature space while 'samples' may be reduced by feature selection below.
    fileio = FileIO()
    unmodified_samples = fileio.load_samples(options.input_samples_filename)
    samples = fileio.load_samples(options.input_samples_filename)
    if options.feature_select:
        print "Selecting top %d features from %s, ordered by %s" % (
            options.feature_select_top_n, options.feature_select,
            options.feature_select_score)
        from pica.AssociationRule import load_rules, AssociationRuleSet
        selected_rules = AssociationRuleSet()
        rules = load_rules(options.feature_select)
        rules.set_target_accuracy(options.feature_select_score)
        selected_rules.extend(rules[:options.feature_select_top_n])
        samples = samples.feature_select(selected_rules)
    classes = fileio.load_classes(options.input_classes_filename)
    unmodified_samples.load_class_labels(classes)
    samples.load_class_labels(classes)
Ejemplo n.º 8
0
                      "--output_filename",
                      help="Write selected organisms to FILE",
                      metavar="FILE")
    parser.add_option("-c",
                      "--classes_filename",
                      help="Read class labels from FILE",
                      metavar="FILE")
    parser.add_option("-t",
                      "--target_class",
                      help="Target class.",
                      metavar="CLASS")

    (options, args) = parser.parse_args()

    pt.start()
    fileio = FileIO()
    samples = fileio.load_samples(options.samples_filename)
    classes = fileio.load_classes(options.classes_filename)
    samples.load_class_labels(classes)
    samples.set_current_class(options.target_class)
    target_samples = []
    samples_time = pt.stop()
    print "Loaded samples (%0.2fs)" % (samples_time)

    pt.start()

    rules = load_rules(options.model_filename)
    indexed_rules = rules.remap_feature_to_index(samples)
    training_time = pt.stop()
    newsamples = {}
Ejemplo n.º 9
0
	def crossvalidate(self):
		"""Run self.r replicates of self.v-fold cross-validation.

		Each replicate shuffles the sample set, splits it into self.v
		partitions, trains every test configuration on each fold, and stores
		per-fold results in self.replicates[replicate][fold] (same order as
		self.test_configurations).
		"""
		root_output = self.root_output
		
		# Create a sibling "<outputFilename>-Files" directory for fold dumps.
		if self.outputFilename != None:
			rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")  # NOTE(review): assigned but never used in this method
			outputPath = self.outputFilename+"-Files"
			if not os.path.exists(outputPath):
				os.makedirs(outputPath)
		

		for replicate in xrange(self.r):
			if MPI_PARALLEL:
				# NOTE(review): MPI branch is a stub; parallel mode does nothing here.
				pass
			else:
				replicate_plus_one = replicate+1
				print "Starting replicate %d"%(replicate_plus_one)
				self.replicates.append([])
				self._randomize_sample_set()
				partitions = self._split_sample_set(self.v)
				for i in xrange(self.v):
					self.replicates[replicate].append([])
					training_set, test_set = self._construct_training_and_testing_sets(partitions,i)
					print "Fold %d: training_set: %d, test set: %d"%(i,len(training_set),len(test_set))
					
					# Persist this fold's train/test split.
					# NOTE(review): outputPath is only bound when
					# self.outputFilename != None; if target_class is set but
					# outputFilename is not, this raises NameError.
					if ( self.target_class != None ):
						fileio = FileIO()
						trainingSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_training.set"
						testSetFile =     outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_test.set"
						print "Saving training set to: "+trainingSetFile
						fileio.save_samples(training_set,trainingSetFile)
						print "Saving test set to: "+testSetFile
						fileio.save_samples(test_set,testSetFile)
		
					for test_configuration in self.test_configurations:
						test_name = test_configuration.name
						new_training_set = training_set
						new_test_set = test_set
						# Optional per-configuration feature selection, fitted
						# on the training fold only.
						if test_configuration.feature_selector:
							features = test_configuration.feature_selector.select(training_set)
							new_training_set = training_set.feature_select(features)
							new_test_set = test_set.feature_select(features)
						model = test_configuration.trainer.train(new_training_set)
						
						if self.outputFilename != None:
							if not hasattr(model, 'write'): # i.e. probably SVM model
								svmModelFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_svm.model"
								model['svm_model'].save(filename=svmModelFile)
								# 'a' append mode: repeated runs keep appending
								# pickles to the same side-car files.
								with open(svmModelFile+".classlabelmap",'a') as outfile:
									pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
								with open(svmModelFile+".classlabelmapindex",'a') as outfile:
									pickle.dump(model["class_label_map_index"],outfile)
								with open(svmModelFile+".featuremapindex",'w') as outfile:
									pickle.dump(new_training_set.get_index_to_feature(), outfile)
						
						results = test_configuration.classifier.test(new_test_set,model)
						self.replicates[replicate][i].append(results) #order of results same as order of configurations
						
						if ( self.target_class != None ):
							print results.print_classification_log()
							print results
						
						# The %(name)s keys below are filled from locals(), so
						# the local variable names above are load-bearing.
						# NOTE(review): 'features' is only bound when a
						# feature_selector ran; NameError otherwise.
						if root_output:
							fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"%(locals()),"w")
							fout.write("\n".join(features))
							fout.close()
				if root_output:
					
					# Per-replicate classification log: sample id, fold number,
					# true class, then one predicted-class column per test
					# configuration (same order as configured).
					fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
					header_fields = ["sample","fold",self.sample_set.current_class]
					for test_configuration in self.test_configurations:
						header_fields.append(test_configuration.name)
					output_dictionary = {}
					output_lines = ["\t".join(header_fields)]
					for fold in xrange(self.v):
						for classification_index in xrange(len(self.replicates[replicate][fold][0].classifications_list)):
							main_sample_record = self.replicates[replicate][fold][0].classifications_list[classification_index]
							output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
							for test_configuration_index in xrange(len(self.test_configurations)):
								test_sample_record = self.replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
								output_line.append(str(test_sample_record.predicted_class))
							output_lines.append("\t".join(output_line))
					fout.write("\n".join(output_lines))
					fout.close()
				print "Finished replicate %d"%(replicate_plus_one)
Ejemplo n.º 10
0
	# Check arguments for crucial errors
	errorCount = 0
	if not options.input_samples_filename:
		error("Please provide a genotype sample file with -s /path/to/genotype.file")
		errorCount += 1
	if not options.model_filename:
		error("Please provide a model file for this phenotype with -m /path/to/model.file")
		errorCount += 1
	if not options.target_class:
		error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
		errorCount += 1
	if errorCount > 0:
		error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
		exit(1)
	
	fileio = FileIO()
	# For the libsvm classifier the model file also supplies the
	# feature-index map used while loading samples (part of the RVF SVM fix).
	if options.algorithm == "libsvm.libSVMClassifier": # RVF: part of SVM fix (feature-index map)
		samples = fileio.load_samples(options.input_samples_filename, indexToAttribute=options.model_filename)
	else: #original code
		samples = fileio.load_samples(options.input_samples_filename)
	# Class labels are optional; without them a null class set is synthesized
	# so prediction can still run.
	if options.input_classes_filename:
		classes = fileio.load_classes(options.input_classes_filename)
	else:
		classes = fileio.init_null_classes(options.input_samples_filename, options.target_class)
	
	#RVF
	"""rules = load_rules(options.model_filename) #original code"""
	if options.algorithm == "libsvm.libSVMClassifier":
		m = svm_model(options.model_filename)
		# NOTE(review): pickle.loads on a side-car file written by the
		# training run; do not point this at untrusted input.
		with open(options.model_filename+".classlabelmap", 'rb') as handle:
			clm = pickle.loads(handle.read())
Ejemplo n.º 11
0
    def crossvalidate(self):
        """Run self.r replicates of self.v-fold cross-validation, evaluating
        each fold under every completeness x contamination perturbation.

        Results are stored in
        self.replicates[replicate][fold][completeness_idx][contamination_idx].
        """
        root_output = self.root_output

        # Create a sibling "<outputFilename>-Files" directory for fold dumps.
        if self.outputFilename != None:
            rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")  # NOTE(review): assigned but never used in this method
            outputPath = self.outputFilename + "-Files"
            if not os.path.exists(outputPath):
                os.makedirs(outputPath)

        for replicate in xrange(self.r):
            if MPI_PARALLEL:
                # NOTE(review): MPI branch is a stub; parallel mode does nothing here.
                pass
            else:
                replicate_plus_one = replicate + 1
                print "Starting replicate %d" % (replicate_plus_one)
                self.replicates.append([])
                self._randomize_sample_set()
                # Keep the unmodified copy aligned with the shuffled sample set.
                self.unmodified = self.unmodified._sort_by_sample_set(
                    self.sample_set)
                partitions = self._split_sample_set(self.v)
                for i in xrange(self.v):
                    self.replicates[replicate].append([])
                    training_set, test_set = self._construct_training_and_testing_sets(
                        partitions, i)
                    print "Fold %d: training_set: %d, test set: %d" % (
                        i, len(training_set), len(test_set))

                    # Persist this fold's train/test split.
                    # NOTE(review): outputPath is only bound when
                    # self.outputFilename != None; raises NameError otherwise.
                    if (self.target_class != None):
                        fileio = FileIO()
                        trainingSetFile = outputPath + "/" + str(
                            self.target_class) + "_R" + str(
                                replicate) + "_F" + str(i) + "_training.set"
                        testSetFile = outputPath + "/" + str(
                            self.target_class) + "_R" + str(
                                replicate) + "_F" + str(i) + "_test.set"
                        print "Saving training set to: " + trainingSetFile
                        fileio.save_samples(training_set, trainingSetFile)
                        print "Saving test set to: " + testSetFile
                        fileio.save_samples(test_set, testSetFile)

                    for test_configuration in self.test_configurations:
                        test_name = test_configuration.name
                        new_training_set = training_set
                        new_test_set = test_set
                        # Optional per-configuration feature selection, fitted
                        # on the training fold only.
                        if test_configuration.feature_selector:
                            features = test_configuration.feature_selector.select(
                                training_set)
                            new_training_set = training_set.feature_select(
                                features)
                            new_test_set = test_set.feature_select(features)
                        model = test_configuration.trainer.train(
                            new_training_set)

                        if self.outputFilename != None:
                            if not hasattr(model,
                                           'write'):  # i.e. probably SVM model
                                svmModelFile = outputPath + "/" + str(
                                    self.target_class) + "_R" + str(
                                        replicate) + "_F" + str(
                                            i) + "_svm.model"
                                model['svm_model'].save(filename=svmModelFile)
                                # 'a' append mode: repeated runs keep appending
                                # pickles to the same side-car files.
                                with open(svmModelFile + ".classlabelmap",
                                          'a') as outfile:
                                    pickle.dump(
                                        model["class_label_map"], outfile
                                    )  #fails with model, because of SWIGpy object
                                with open(svmModelFile + ".classlabelmapindex",
                                          'a') as outfile:
                                    pickle.dump(model["class_label_map_index"],
                                                outfile)
                                with open(svmModelFile + ".featuremapindex",
                                          'w') as outfile:
                                    pickle.dump(
                                        new_training_set.get_index_to_feature(
                                        ), outfile)

                        #####################################################################################
                        # add here contamination&completeness
                        #####################################################################################
                        # Group each test sample's attribute indices by class
                        # label; used below to draw contaminating attributes
                        # from the opposite class.
                        all_class_labels = new_test_set.get_class_labels()
                        sample_attribute_collection = {}
                        for index in all_class_labels:
                            sample_attribute_collection[index] = []

                        for sample in new_test_set.__iter__():
                            temp_attributes_list = list(
                                sample.get_attributes_index_list())
                            sample_attribute_collection[
                                sample.current_class_label].append(
                                    temp_attributes_list)

                        # Evaluate every completeness x contamination combination.
                        for w in range(0, len(self.completeness)):
                            self.replicates[replicate][i].append([])
                            incomplete_test_set = new_test_set.induce_incompleteness(
                                self.completeness[w])

                            # Contamination requires exactly two class labels to
                            # draw "foreign" attributes from; otherwise record
                            # empty result slots and move on.
                            if len(sample_attribute_collection.keys()) != 2:
                                print(sample_attribute_collection.keys())
                                sys.stderr.write(
                                    "Warning: skipping contamination of Fold %i in replicate %i: need exactly 2 different class labels\n"
                                    % (i, replicate))
                                for z in range(0, len(self.contamination)):
                                    self.replicates[replicate][i][w].append([])
                                continue
                            for z in range(0, len(self.contamination)):
                                self.replicates[replicate][i][w].append([])
                                #print(completeness,contamination)
                                contaminated_test_set = incomplete_test_set.introduce_contamination(
                                    sample_attribute_collection,
                                    self.contamination[z])

                                contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(
                                    new_training_set)
                                #print(dir(model))

                                results = test_configuration.classifier.test(
                                    contaminated_test_set, model)
                                self.replicates[replicate][i][w][z].append(
                                    results
                                )  #order of results same as order of configurations

                                if (self.target_class != None):
                                    print results.print_classification_log()
                                    print results

                                # NOTE(review): 'features' is only bound when a
                                # feature_selector ran; NameError otherwise.
                                if root_output:
                                    fout = open(
                                        "%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"
                                        % (locals()), "w")
                                    fout.write("\n".join(features))
                                    fout.close()
                if root_output:

                    # Per-replicate classification log.
                    # NOTE(review): this block indexes
                    # replicates[replicate][fold][k] by test-configuration
                    # index, but this variant nests results as
                    # [fold][completeness][contamination] — verify before
                    # relying on this log.
                    fout = open(
                        "%(root_output)s.r%(replicate_plus_one)d.classification.log"
                        % (locals()), "w")
                    header_fields = [
                        "sample", "fold", self.sample_set.current_class
                    ]
                    for test_configuration in self.test_configurations:
                        header_fields.append(test_configuration.name)
                    output_dictionary = {}
                    output_lines = ["\t".join(header_fields)]
                    for fold in xrange(self.v):
                        for classification_index in xrange(
                                len(self.replicates[replicate][fold]
                                    [0].classifications_list)):
                            main_sample_record = self.replicates[replicate][
                                fold][0].classifications_list[
                                    classification_index]
                            output_line = [
                                str(main_sample_record.who),
                                str(fold + 1),
                                str(main_sample_record.true_class)
                            ]
                            for test_configuration_index in xrange(
                                    len(self.test_configurations)):
                                test_sample_record = self.replicates[
                                    replicate][fold][
                                        test_configuration_index].classifications_list[
                                            classification_index]
                                output_line.append(
                                    str(test_sample_record.predicted_class))
                            output_lines.append("\t".join(output_line))
                    fout.write("\n".join(output_lines))
                    fout.close()
                print "Finished replicate %d" % (replicate_plus_one)
Ejemplo n.º 12
0
"""Get the mutual information shared between samples in file 1 with class labels in file 2 with taxonomy levels in file 3 and output to file 4"""

import sys

samples_filename = sys.argv[1]
class_labels_filename = sys.argv[2]
metadata_filename = sys.argv[3]
output_filename = sys.argv[4]

from pica.Sample import SampleSet, ClassLabelSet
from pica.io.FileIO import FileIO
from pica.IntegerMapping import IntegerMapping
from pica.trainers.cwmi.CWMILibrary import CWMILibrary

fileio = FileIO()
cwmilibrary = CWMILibrary()
metadata = fileio.load_metadata(metadata_filename)
samples = fileio.load_samples(samples_filename)
classes = fileio.load_classes(class_labels_filename)
samples.load_class_labels(classes)
confounders = metadata.get_key_list()[1:]

outlines = []
header_line = ["phenotype"]

header_line.extend(confounders)
header_line.append("total")
outlines.append("\t".join(header_line))

for class_name in classes.get_classes():
    "generate phenotype map"
Ejemplo n.º 13
0
    # Validate required CLI options; report all missing arguments then exit.
    if not options.model_filename:
        error(
            "Please provide a model file for this phenotype with -m /path/to/model.file"
        )
        errorCount += 1
    if not options.target_class:
        error(
            "Please provide the phenotype target to be predicted with -t \"TRAITNAME\""
        )
        errorCount += 1
    if errorCount > 0:
        error("For help on usage, try calling:\n\tpython %s -h" %
              os.path.basename(sys.argv[0]))
        exit(1)

    fileio = FileIO()
    # For the libsvm classifier the model file also supplies the
    # feature-index map used while loading samples (part of the RVF SVM fix).
    if options.algorithm == "libsvm.libSVMClassifier":  # RVF: part of SVM fix (feature-index map)
        samples = fileio.load_samples(options.input_samples_filename,
                                      indexToAttribute=options.model_filename)
    else:  #original code
        samples = fileio.load_samples(options.input_samples_filename)
    # Class labels are optional; without them a null class set is synthesized
    # so prediction can still run.
    if options.input_classes_filename:
        classes = fileio.load_classes(options.input_classes_filename)
    else:
        classes = fileio.init_null_classes(options.input_samples_filename,
                                           options.target_class)

    #RVF
    """rules = load_rules(options.model_filename) #original code"""
    if options.algorithm == "libsvm.libSVMClassifier":
        m = svm_model(options.model_filename)
Ejemplo n.º 14
0

# Command-line entry point: load samples and class labels, then prepare a
# rule model for selecting organisms by target class.
# NOTE(review): this excerpt is truncated; the logic that consumes
# indexed_rules and newsamples continues beyond what is visible here.
if __name__ == "__main__":
	pt = ProgramTimer()
	parser = OptionParser(version="%prog 0.8")
	parser.add_option("-s","--samples_filename",help="Read samples from FILE",metavar="FILE")
	parser.add_option("-m","--model_filename",help="Read rules from FILE",metavar="FILE")
	parser.add_option("-o","--output_filename",help="Write selected organisms to FILE",metavar="FILE")
	parser.add_option("-c","--classes_filename",help="Read class labels from FILE",metavar="FILE")
	parser.add_option("-t","--target_class",help="Target class.",metavar="CLASS")
	
	(options, args) = parser.parse_args()

	
	# Load samples plus class labels and focus on the requested target class.
	pt.start()
	fileio = FileIO()
	samples = fileio.load_samples(options.samples_filename)
	classes = fileio.load_classes(options.classes_filename)
	samples.load_class_labels(classes)
	samples.set_current_class(options.target_class)
	target_samples = []
	samples_time = pt.stop()
	print "Loaded samples (%0.2fs)"%(samples_time)
	
	# Load the rule model and remap its feature names to sample indices.
	pt.start()
	
	rules = load_rules(options.model_filename)
	indexed_rules = rules.remap_feature_to_index(samples)
	training_time = pt.stop()
	newsamples = {}
	
Ejemplo n.º 15
0
def replicateProcess(parametertuple):
            training_set, test_set, target_class, test_configurations, outputFilename, completeness, contamination, root_output, replicate, fold = parametertuple
            w_tot = len(completeness)
            z_tot = len(contamination)
            output=[]
            replicate_plus_one=replicate+1
            print "Fold %d: training_set: %d, test set: %d"%(fold,len(training_set),len(test_set))
            if ( target_class != None ):
                fileio = FileIO()
                trainingSetFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_training.set"
                testSetFile =     outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_test.set"
                print "Saving training set to: "+trainingSetFile
                fileio.save_samples(training_set,trainingSetFile)
                print "Saving test set to: "+testSetFile
                fileio.save_samples(test_set,testSetFile)

            for test_configuration_index in xrange(len(test_configurations)):
                test_configuration=test_configurations[test_configuration_index]
                test_name = test_configuration.name
                new_training_set = training_set
                new_test_set = test_set
                if test_configuration.feature_selector:
                    features = test_configuration.feature_selector.select(training_set)
                    new_training_set = training_set.feature_select(features)
                    new_test_set = test_set.feature_select(features)
                model = test_configuration.trainer.train(new_training_set)

                if outputFilename != None:
                    if not hasattr(model, 'write'): # i.e. probably SVM model
                        svmModelFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_svm.model"                                                   
                        model['svm_model'].save(filename=svmModelFile)
                        with open(svmModelFile+".classlabelmap",'a') as outfile:
                            pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
                        with open(svmModelFile+".classlabelmapindex",'a') as outfile:
                            pickle.dump(model["class_label_map_index"],outfile)
                        with open(svmModelFile+".featuremapindex",'w') as outfile:
                            pickle.dump(new_training_set.get_index_to_feature(), outfile)


                all_class_labels=new_test_set.get_class_labels()
                sample_attribute_collection={}
                for index in all_class_labels:
                    sample_attribute_collection[index]=[]

                for sample in new_test_set.__iter__():
                    temp_attributes_list=sample.get_attributes_index_list()
                    sample_attribute_collection[sample.current_class_label].append(temp_attributes_list)



                for w in xrange(w_tot):
                    output.append([])
                    incomplete_test_set = new_test_set.induce_incompleteness(completeness[w])
                    err=0
                    for z in xrange(z_tot):
                        #output[w][z].append([])
                        if round(contamination[z],1) == 0.0:
                            results = test_configuration.classifier.test(incomplete_test_set.map_test_set_attributes_to_training_set(new_training_set),model)
                            summary = ClassificationSummary(results)
                            output[w].append(summary)

                        elif len(sample_attribute_collection.keys())==2:
                            #do crosscontamination if exactly 2 class labels given
                            contaminated_test_set = incomplete_test_set.introduce_contamination(sample_attribute_collection,contamination[z])

                            contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(new_training_set)
                              
                            results = test_configuration.classifier.test(contaminated_test_set,model)
                            summary = ClassificationSummary(results)
                            output[w].append(summary)

                            if ( target_class != None ):
                                print results.print_classification_log()
                                print results

                            if root_output:
                                fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(fold)d.%(test_name)s.features"%(locals()),"w")
                                fout.write("\n".join(features))
                                fout.close()
                        elif err==0:
                            sys.stderr.write("Warning: skipping contamination of fold %i of replicate %i: exactly 2 different class labels needed!"%(fold,replicate))
                            err=1

                        #print(replicates[replicate][fold][w][z])
                                
#            if root_output:

#                fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
#                header_fields = ["sample","fold",sample_set.current_class]
#                for test_configuration in test_configurations:
#                      header_fields.append(test_configuration.name)
#                output_dictionary = {}
#                output_lines = ["\t".join(header_fields)]
#                for fold in xrange(v):
#                      for classification_index in xrange(len(replicates[replicate][fold][0].classifications_list)):
#                              main_sample_record = replicates[replicate][fold][0].classifications_list[classification_index]
#                              output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
#                              for test_configuration_index in xrange(len(test_configurations)):
#                                      test_sample_record = replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
#                                      output_line.append(str(test_sample_record.predicted_class))
#                              output_lines.append("\t".join(output_line))
#                fout.write("\n".join(output_lines))
#                fout.close()
            print "Finished replicate %d, fold %d"%(replicate_plus_one,fold)
            return output