Ejemplo n.º 1
0
def gappy_pair_data(sequences, maximum_distance, **args):
    """generate a dataset object that contains all pairs of letters in a sequence that are
    within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy pair
        representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum distance between pairs of  length of the
        substrings to consider

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    data = SparseDataSet(generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        # L2-normalize each example
        data.normalize(2)
    return data
Ejemplo n.º 2
0
def gappy_pair_data(sequences, maximum_distance, **args):
    """generate a dataset object that contains all pairs of letters in a sequence that are
    within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy pair
        representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum distance between pairs of  length of the
        substrings to consider

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    data = SparseDataSet(
        generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        # L2-normalize each example
        data.normalize(2)
    return data
Ejemplo n.º 3
0
def spectrum_data(sequences, k1, k2=None, **args):
    """generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.
    Reference:
    C. Leslie, E. Eskin, and WS Noble.
    The spectrum kernel:  A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the sequences or
         a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    # NOTE(review): when k2 is omitted the spectrum covers lengths k1 AND
    # k1 + 1 (historical behavior, preserved here) -- confirm this is
    # intended rather than k2 = k1 (only length k1).
    if k2 is None:
        k2 = k1 + 1
    if mismatch:
        spectrum_generator = generate_single_mismatch_spectrum
    else:
        spectrum_generator = generate_spectrum
    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    # accumulate features for every substring length from k1 + 1 up to k2;
    # each length's features are normalized before being appended
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data
Ejemplo n.º 4
0
 def __init__(self, lexicon, C=1, num_features=100):
     """Set up the SVM sentiment classifier, restoring a previously pickled
     model from disk when one exists, otherwise creating a fresh
     linear-kernel SVM.

     :Parameters:
       - `lexicon` - lexicon object whose word list defines the feature space
       - `C` - SVM soft-margin penalty parameter [default: 1]
       - `num_features` - NOTE(review): this argument is ignored; the feature
         count is always taken from the lexicon size below
     """
     self.training_set = None
     self.classes = None 
     self.test_set = None
     self.results = None
     self.kernel = ker.Linear()
     self.C = C  
     # on-disk locations of the pickled features/labels used to rebuild
     # the sparse dataset that loadSVM needs
     self.feature_data = PATH + "/learning/stored/feature.data"
     self.label_data = PATH + "/learning/stored/svm_label.data"
     self.lexicon = lexicon
     self.num_features = len(self.lexicon.words.keys())
     try:
         # try to restore a previously trained classifier from disk
         print "Loading existing SVM..."
         features = pickle.load(open(self.feature_data))
         labels = pickle.load(open(self.label_data))
         sparsedata = SparseDataSet(features, L=labels) 
         self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata)
     except Exception as e:
         # any failure (missing files, unpickling error) falls back to an
         # untrained linear SVM
         print e
         print "Existing SVM not found!"
         self.svm_classifier = svm.SVM(self.kernel)
     self.accuracy = None
     self.predicted_labels = None
     # golub-score feature filtering chained in front of the SVM
     score = featsel.FeatureScore('golub')
     self.filter = featsel.Filter(score)
     self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
     self.chain = Chain([self.feature_selector, self.svm_classifier])
Ejemplo n.º 5
0
 def train(self, training_set, labels):
     """Train the SVM on *training_set* (a list of raw instances) with the
     given *labels*, then persist the model and its data via self.save.

     Side effects: sets self.training_set, trains self.svm_classifier,
     writes the model/features/labels to disk, and prints progress.
     """
     print Tcolors.ACT + " Training SVM with chaining..."
     # convert raw instances into the sparse feature representation
     features = self.compute_features(training_set) 
     data = SparseDataSet(features, L=labels) 
     print Tcolors.CYAN
     self.training_set = data 
     self.svm_classifier.train(data)     
     # persist the trained model together with the data it was built from
     self.save(data,features,labels)
     print Tcolors.C
Ejemplo n.º 6
0
 def classify(self, sentences, labels):
     """Classify *sentences* with the trained SVM and return the test results.

     Side effects: sets self.test_set and self.results, and prints the
     first sentence being classified.
     """
     self.test_set = self.compute_features(sentences)
     print
     print Tcolors.ACT + " Classifying instance with SVM: " + Tcolors.RED + sentences[0] + Tcolors.C
     print Tcolors.HEADER
     # labels are attached so PyML's test() can report accuracy
     test_data = SparseDataSet(self.test_set, L=labels)
     self.results = self.svm_classifier.test(test_data)
     print Tcolors.C 
     return self.results
Ejemplo n.º 7
0
def spectrum_data(sequences, k1, k2=None, **args):
    """generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.
    Reference:
    C. Leslie, E. Eskin, and WS Noble.
    The spectrum kernel:  A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the sequences or
         a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    # NOTE(review): when k2 is omitted the spectrum covers lengths k1 AND
    # k1 + 1 (historical behavior, preserved here) -- confirm this is
    # intended rather than k2 = k1 (only length k1).
    if k2 is None:
        k2 = k1 + 1
    if mismatch:
        spectrum_generator = generate_single_mismatch_spectrum
    else:
        spectrum_generator = generate_spectrum
    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    # accumulate features for every substring length from k1 + 1 up to k2;
    # each length's features are normalized before being appended
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data
Ejemplo n.º 8
0
def positional_kmer_data(sequences, k1, k2=None, **args):
    """generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted degree'
    kernel of Sonenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences or
         a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `weighted` - whether to use the weighting of sonenborg et al
        [default:  equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1

    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    # default: allow shifting over the whole (first) sequence
    shift_end = args.get('shift_end', len(sequences[0]) - 1)

    # BUG FIX: the original tested `if weighted in args:` -- membership of
    # the VALUE False among the keys -- so the 'weighted' keyword was never
    # read.  TODO(review): the Sonenburg weighting itself is still not
    # implemented; both branches of the original produced equal weights,
    # and that (equal-weights) behavior is preserved here.
    weighted = args.get('weighted', False)
    weights = [1.0 for _ in range(k1, k2 + 1)]
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0], shift, shift_start, shift_end))
    if normalize:
        data.normalize(2)
    # accumulate features for each kmer length from k1 + 1 up to k2
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1], shift, shift_start, shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data
Ejemplo n.º 9
0
def positional_kmer_data(sequences, k1, k2=None, **args):
    """generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted degree'
    kernel of Sonenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences or
         a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `weighted` - whether to use the weighting of sonenborg et al
        [default:  equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    # dict.get replaces the original membership-test-then-index pairs
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1

    # a string argument names a Fasta file to read the sequences from
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    # default: allow shifting over the whole (first) sequence
    shift_end = args.get('shift_end', len(sequences[0]) - 1)

    # BUG FIX: the original tested `if weighted in args:` -- membership of
    # the VALUE False among the keys -- so the 'weighted' keyword was never
    # read.  TODO(review): the Sonenburg weighting itself is still not
    # implemented; both branches of the original produced equal weights,
    # and that (equal-weights) behavior is preserved here.
    weighted = args.get('weighted', False)
    weights = [1.0 for _ in range(k1, k2 + 1)]
    data = SparseDataSet(
        generate_positional_kmers(sequences, k1, prefix, skip, weights[0],
                                  shift, shift_start, shift_end))
    if normalize:
        data.normalize(2)
    # accumulate features for each kmer length from k1 + 1 up to k2
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(
            generate_positional_kmers(sequences, k, prefix, skip,
                                      weights[k - k1], shift, shift_start,
                                      shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data