def gappy_pair_data(sequences, maximum_distance, **args):
    """Generate a dataset object that contains all pairs of letters in a
    sequence that are within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy
        pair representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum distance between pairs of letters

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    # dict.get replaces the repeated "if key in args" boilerplate
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # a bare string is taken to be the name of a Fasta file
    # (isinstance instead of the type(x) == type('') anti-pattern)
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(
        generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        data.normalize(2)
    return data
def gappy_pair_data(sequences, maximum_distance, **args):
    """Generate a dataset object that contains all pairs of letters in a
    sequence that are within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy
        pair representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum distance between pairs of letters

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    # collapse the "if key in args" ladders into dict.get lookups
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # a string argument names a Fasta file to read; isinstance is the
    # idiomatic replacement for comparing type(sequences) == type('')
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(
        generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        data.normalize(2)
    return data
def spectrum_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.

    Reference: C. Leslie, E. Eskin, and WS Noble. The spectrum kernel:
    A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the
        sequences or a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    # dict.get replaces the repeated "if key in args" boilerplate
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    if k2 is None:
        k2 = k1 + 1
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    # pick the generator once; both share the same call signature
    if mismatch:
        spectrum_generator = generate_single_mismatch_spectrum
    else:
        spectrum_generator = generate_spectrum
    # a bare string names a Fasta file (isinstance over type comparison)
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    # accumulate the spectra of the longer substring lengths, normalizing
    # each length's features before merging so they are weighted equally
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data
def __init__(self, lexicon, C=1, num_features=100):
    """Set up the SVM wrapper: a linear kernel, a golub-score feature
    filter, and a classifier chained after feature selection.  Tries to
    load a previously pickled feature/label set and stored SVM from disk;
    on any failure falls back to a fresh, untrained SVM.

    :Parameters:
      - `lexicon` - object whose `words` dict determines the feature count
      - `C` - SVM soft-margin parameter (stored but see NOTE below)
      - `num_features` - NOTE(review): this parameter is ignored — it is
        immediately overwritten from len(lexicon.words) below; confirm
        whether that is intentional
    """
    # result/bookkeeping slots, populated by train()/classify()
    self.training_set = None
    self.classes = None
    self.test_set = None
    self.results = None
    self.kernel = ker.Linear()
    self.C = C
    # on-disk locations of the pickled features and labels
    self.feature_data = PATH + "/learning/stored/feature.data"
    self.label_data = PATH + "/learning/stored/svm_label.data"
    self.lexicon = lexicon
    # overwrites the num_features argument with the lexicon's word count
    self.num_features = len(self.lexicon.words.keys())
    try:
        # best-effort load of a previously trained classifier
        print "Loading existing SVM..."
        features = pickle.load(open(self.feature_data))
        labels = pickle.load(open(self.label_data))
        sparsedata = SparseDataSet(features, L=labels)
        self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata)
    except Exception as e:
        # any failure (missing files, unpickling errors) → fresh SVM
        print e
        print "Existing SVM not found!"
        self.svm_classifier = svm.SVM(self.kernel)
    self.accuracy = None
    self.predicted_labels = None
    # golub-score filter feeding a feature-selection + SVM chain
    score = featsel.FeatureScore('golub')
    self.filter = featsel.Filter(score)
    self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
    self.chain = Chain([self.feature_selector, self.svm_classifier])
def train(self, training_set, labels):
    """Train the SVM on `training_set` (raw instances turned into features
    via self.compute_features) with the given `labels`, then persist the
    trained model and its features/labels via self.save.
    """
    print Tcolors.ACT + " Training SVM with chaining..."
    # project the raw training instances into the feature space
    features = self.compute_features(training_set)
    data = SparseDataSet(features, L=labels)
    print Tcolors.CYAN
    self.training_set = data
    self.svm_classifier.train(data)
    # persist model + data so __init__ can reload it next run
    self.save(data,features,labels)
    print Tcolors.C
def classify(self, sentences, labels):
    """Run the trained SVM over `sentences` and return the test results.

    `labels` are the (known) labels attached to the test data so the
    underlying test() call can score against them; results are also kept
    on self.results.
    """
    # featurize the sentences the same way training data was featurized
    self.test_set = self.compute_features(sentences)
    print
    print Tcolors.ACT + " Classifying instance with SVM: " + Tcolors.RED + sentences[0] + Tcolors.C
    print Tcolors.HEADER
    test_data = SparseDataSet(self.test_set, L=labels)
    self.results = self.svm_classifier.test(test_data)
    print Tcolors.C
    return self.results
def spectrum_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.

    Reference: C. Leslie, E. Eskin, and WS Noble. The spectrum kernel:
    A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the
        sequences or a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    # keyword handling via dict.get instead of "if key in args" ladders
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    if k2 is None:
        k2 = k1 + 1
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    # both generators share the (sequences, k, prefix, skip) signature
    spectrum_generator = (generate_single_mismatch_spectrum if mismatch
                          else generate_spectrum)
    # a string argument names a Fasta file (isinstance over type())
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    # merge in the spectra for each longer substring length, normalizing
    # each before the merge so the lengths contribute equally
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in
    specific positions.  When using weighting, this is essentially the
    'weighted degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences
        or a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []]
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting
        [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    # keyword handling via dict.get instead of "if key in args" ladders
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    if k2 is None:
        k2 = k1 + 1
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    # a string argument names a Fasta file (isinstance over type())
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    # default: shift over the whole first sequence
    shift_end = args.get('shift_end', len(sequences[0]) - 1)
    # BUG FIX: the original tested `weighted in args` (membership of the
    # value False among the keys) so the 'weighted' keyword was silently
    # ignored; test for the key name instead.
    weighted = args.get('weighted', False)
    # NOTE(review): the original produced identical equal weights in both
    # the weighted and unweighted branches — the Sonnenburg weighting
    # appears unimplemented; preserving equal weights here. TODO confirm.
    weights = [1.0] * (k2 - k1 + 1)
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0], shift,
        shift_start, shift_end))
    if normalize:
        data.normalize(2)
    # merge in the positional kmers for each longer length, normalizing
    # each length's features before merging
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1], shift,
            shift_start, shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in
    specific positions.  When using weighting, this is essentially the
    'weighted degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences
        or a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature (useful
        when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []]
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting
        [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    # collapse the "if key in args" ladders into dict.get lookups
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    if k2 is None:
        k2 = k1 + 1
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    # a bare string names a Fasta file (isinstance over type comparison)
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    # default shift range extends to the end of the first sequence
    shift_end = args.get('shift_end', len(sequences[0]) - 1)
    # BUG FIX: original wrote `if weighted in args:` — a membership test
    # of the value False among the keys — so the 'weighted' keyword could
    # never take effect; look the key up by name instead.
    weighted = args.get('weighted', False)
    # NOTE(review): both branches of the original assigned the same equal
    # weights (the Sonnenburg weighting looks unimplemented); equal
    # weights are preserved here — TODO confirm intent.
    weights = [1.0] * (k2 - k1 + 1)
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0], shift,
        shift_start, shift_end))
    if normalize:
        data.normalize(2)
    # fold in the positional kmers for each remaining length, normalizing
    # each length's features before merging them into the dataset
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1], shift,
            shift_start, shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data