def gappy_pair_data(sequences, maximum_distance, **args):
    """Generate a dataset object that contains all pairs of letters in a
    sequence that are within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy
        pair representation, or the name of a Fasta file that contains the
        sequences
      - `maximum_distance` - the maximum distance between the pairs of
        letters to consider

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features [default: []]
    """
    # dict.get collapses the original "if key in args" assignment chains.
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # A string argument is interpreted as the path of a Fasta file.
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    data = SparseDataSet(
        generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        data.normalize(2)  # L2 (Euclidean) normalization of each example
    return data
def gappy_pair_data(sequences, maximum_distance, **args):
    """Build a sparse dataset of gappy-pair features: every pair of letters
    in a sequence that occur within `maximum_distance` of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy
        pair representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum allowed distance between the two
        letters of a pair

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    prefix = args['prefix'] if 'prefix' in args else ''
    normalize = args['normalize'] if 'normalize' in args else True
    skip = args['skip'] if 'skip' in args else []
    # A string is taken to be the name of a Fasta file to read.
    if type(sequences) is str:
        sequences = fasta_read(sequences)

    pairs = generate_gappy_pairs(sequences, maximum_distance, prefix, skip)
    dataset = SparseDataSet(pairs)
    if normalize:
        dataset.normalize(2)
    return dataset
def spectrum_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.

    Reference:
    C. Leslie, E. Eskin, and W.S. Noble.
    The spectrum kernel: A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either the name of a fasta file that contains the
        sequences or a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    # NOTE(review): when k2 is omitted it defaults to k1 + 1, so the
    # spectrum covers lengths k1 AND k1 + 1 -- confirm this is intended
    # rather than k2 = k1 (a single length).
    if k2 is None:
        k2 = k1 + 1
    if mismatch:
        spectrum_generator = generate_single_mismatch_spectrum
    else:
        spectrum_generator = generate_spectrum
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    # Each substring length contributes its own (optionally normalized)
    # feature set; lengths k1 + 1 .. k2 are appended onto the k1 dataset.
    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data
def spectrum_data(sequences, k1, k2=None, **args):
    """Build a sparse dataset whose implicit kernel is the spectrum kernel.

    Reference:
    C. Leslie, E. Eskin, and WS Noble.
    The spectrum kernel:  A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the
        sequences or a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    prefix = args['prefix'] if 'prefix' in args else ''
    normalize = args['normalize'] if 'normalize' in args else True
    skip = args['skip'] if 'skip' in args else []
    allow_mismatch = args['mismatch'] if 'mismatch' in args else False
    if k2 is None:
        k2 = k1 + 1
    make_spectrum = (generate_single_mismatch_spectrum if allow_mismatch
                     else generate_spectrum)
    if type(sequences) is str:
        sequences = fasta_read(sequences)

    def _length_k_dataset(k):
        # One (optionally normalized) feature set per substring length.
        part = SparseDataSet(make_spectrum(sequences, k, prefix, skip))
        if normalize:
            part.normalize(2)
        return part

    data = _length_k_dataset(k1)
    for k in range(k1 + 1, k2 + 1):
        data.addFeatures(_length_k_dataset(k))

    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted
    degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences or
        a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1

    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    # Default shift_end is the last index of the first sequence; this
    # assumes all sequences share that length -- TODO confirm with callers.
    shift_end = args.get('shift_end', len(sequences[0]) - 1)

    # BUG FIX: the original tested `if weighted in args`, i.e. `False in
    # args`, which can never be true for a string-keyed kwargs dict, so the
    # documented 'weighted' keyword was silently ignored.
    weighted = args.get('weighted', False)
    # TODO(review): the Sonnenburg et al. weighting is not implemented --
    # both branches produce uniform weights, so the output is identical
    # either way (which also makes this bug fix behavior-preserving).
    if weighted:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    else:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    # One feature set per kmer length; lengths k1+1 .. k2 are appended.
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0], shift, shift_start,
        shift_end))
    if normalize:
        data.normalize(2)
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1], shift, shift_start,
            shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted
    degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences or
        a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1
        and k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1

    if isinstance(sequences, str):
        sequences = fasta_read(sequences)

    # shift_end defaults to the final index of the first sequence; this
    # presumes all sequences have equal length -- TODO verify.
    shift_end = args.get('shift_end', len(sequences[0]) - 1)

    # BUG FIX: `if weighted in args` checked whether the value False was a
    # key of args (never true), so the 'weighted' keyword never took effect;
    # the correct lookup is by the string key 'weighted'.
    weighted = args.get('weighted', False)
    # TODO(review): Sonnenburg weighting is not actually implemented; the
    # two branches below are identical (uniform weights), so fixing the key
    # lookup does not alter any result.
    if weighted:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    else:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    data = SparseDataSet(
        generate_positional_kmers(sequences, k1, prefix, skip, weights[0],
                                  shift, shift_start, shift_end))
    if normalize:
        data.normalize(2)
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(
            generate_positional_kmers(sequences, k, prefix, skip,
                                      weights[k - k1], shift, shift_start,
                                      shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)

    return data