Example #1
0
def p_distance(seq1, seq2):
    """Hamming distance between two sequences, counting only non-gap sites.

    Both inputs are converted with ``str``. A site is kept only when neither
    string has a ``-`` at that position; the kept characters are wrapped in
    ``skbio`` ``Sequence`` objects and handed to ``hamming``. A NaN result is
    reported as ``0.0`` (presumably the case where no site survives the
    gap filter -- confirm against ``hamming``).
    """
    from skbio.sequence import Sequence
    from numpy import isnan

    first = str(seq1)
    second = str(seq2)

    kept1 = []
    kept2 = []
    for pos, char1 in enumerate(first):
        # Short-circuit on purpose: ``second`` is only indexed when ``first``
        # has no gap at this position.
        if char1 != "-" and second[pos] != "-":
            kept1.append(char1)
            kept2.append(second[pos])

    distance = hamming(Sequence("".join(kept1)), Sequence("".join(kept2)))
    return 0.0 if isnan(distance) else distance
Example #2
0
def _coerce_alignment_input_type(seq, disallow_alignment):
    """ Normalize an aligner input to an ``Alignment`` object.

    Strings are wrapped in a ``Sequence`` whose ``'id'`` metadata is ``''``.
    A ``Sequence`` lacking an ``'id'`` metadata entry is copied and given
    ``''`` as its id, so the caller's object is not modified. An
    ``Alignment`` is returned as-is unless ``disallow_alignment`` is true.

    Raises ``TypeError`` for a disallowed ``Alignment`` or any other type.
    """
    if isinstance(seq, string_types):
        wrapped = Sequence(seq, metadata={'id': ''})
        return Alignment([wrapped])

    if isinstance(seq, Sequence):
        if 'id' not in seq.metadata:
            # Work on a copy so the caller's sequence keeps its metadata.
            seq = seq.copy()
            seq.metadata['id'] = ''
        return Alignment([seq])

    if isinstance(seq, Alignment):
        if not disallow_alignment:
            return seq
        # This rejects aligning either a pair of alignments, or an alignment
        # and a sequence. That is not supported for local alignment: there
        # is no clear use case, and it is not obvious how it would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")

    raise TypeError("Unsupported type provided to aligner: %r." %
                    type(seq))
Example #3
0
def _clustal_to_alignment(fh, strict=True):
    r"""Read a Clustal multiple sequence alignment into an ``Alignment``.

    Parameters
    ----------
    fh : open file object
        An open Clustal file.
    strict : boolean
        Forwarded to ``_label_line_parser``; whether or not to raise a
        ``ClustalFormatError`` when no labels are found.

    Returns
    -------
    skbio.Alignment
        Alignment object containing the aligned biological sequences, one
        ``Sequence`` per label with the label stored as its ``'id'``
        metadata.

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If the sequences in `fh` don't have the same sequence length
        or if the sequence ids don't properly match with the subsequences.

    Notes
    -----
    Only lines accepted by ``_is_clustal_seq_line`` are parsed; any line
    that starts with a blank is skipped.

    The order of the sequences in the original file is preserved. A dict is
    used as an intermediate, however, so two sequences can't share a label.
    This is probably OK since Clustal will refuse to run on a FASTA file in
    which two sequences have the same label, but it could cause trouble with
    manually edited files (all the segments of the conflicting sequences
    would be interleaved, possibly in an unpredictable way).

    Trailing numbers on a line (i.e. Clustal was run with `-LINENOS=ON`) are
    silently removed. No check is made that those numbers match the number
    of characters printed so far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ,  "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through sequence
        weighting, position-specific gap penalties and weight matrix choice.
        Thompson", Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """
    sequence_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, sequence_lines)
    data, labels = _label_line_parser(records, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # Iterate over ``labels`` (not ``data``) to keep the file's ordering.
    return Alignment([
        Sequence(sequence=''.join(data[label]), metadata={'id': label})
        for label in labels
    ])
Example #4
0
    def distance_to(self, other):
        """Return the distance between this sequence and ``other``.

        The distance is computed with the ``distance`` method of the skbio
        sequence class (``SkSequence``). Both sequences are compared over
        their common length only: the tail of the longer one is ignored.

        The truncation is applied to local copies; neither ``self.sequence``
        nor ``other.sequence`` is modified, so the result does not depend on
        which distances were computed earlier.

        Parameters
        ----------
        other : object
            Any object exposing a sliceable ``sequence`` attribute.

        Returns
        -------
        The value returned by ``SkSequence.distance`` for the two
        length-matched sequences.
        """
        common_length = min(len(self.sequence), len(other.sequence))

        skseq_self = SkSequence(self.sequence[:common_length])
        skseq_other = SkSequence(other.sequence[:common_length])

        return skseq_self.distance(skseq_other)
Example #5
0
def align_sequences(seqs):
    """Align sequences with Clustal Omega and return the aligned strings.

    The inputs are written in FASTA format to ``rational_designs.fa`` in the
    current working directory, ``clustalo -i`` is run on that file, and its
    standard output is parsed as FASTA.

    Parameters
    ----------
    seqs : iterable
        Values accepted by the ``skbio.sequence.Sequence`` constructor.

    Returns
    -------
    list of str
        One string per record parsed from the ``clustalo`` output.

    Raises
    ------
    subprocess.CalledProcessError
        If ``clustalo`` exits with a non-zero status. Without this check a
        failed run would be parsed as if it were a valid alignment.
    """
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    fasta = 'rational_designs.fa'
    write((Sequence(x) for x in seqs), format='fasta', into=fasta)

    command = ['clustalo', '-i', fasta]
    # check=True surfaces a failing aligner instead of silently parsing
    # whatever (possibly empty) output it produced.
    completed = run(command, stdout=PIPE, encoding='utf8', check=True)
    msa = read(io.StringIO(completed.stdout), format='fasta')

    return [str(x) for x in msa]
Example #6
0
 def _get_position(self, i):
     """Build one Sequence from element ``i`` of every stored sequence."""
     column = [member[i] for member in self._seqs]
     # TODO: once positional metadata exists, attach row ``i`` of it to the
     # result, e.g. as dict(self.positional_metadata.iloc[i]), guarded by
     # self.has_positional_metadata().
     return Sequence.concat(column, how='outer')
Example #7
0
def align2skbio(align):
    """Convert a mapping of id -> sequence data into a ``skbio.TabularMSA``.

    Each key is stringified and stored as the ``'id'`` metadata of the
    ``Sequence`` built from the corresponding value.
    """
    records = []
    for seq_id, residues in align.items():
        records.append(Sequence(residues, metadata={'id': str(seq_id)}))
    return skbio.TabularMSA(records)
Example #8
0
 def _get_position(self, i):
     """Build one Sequence from element ``i`` of every stored sequence.

     When positional metadata is present, row ``i`` of it becomes the
     metadata of the returned sequence.
     """
     column = [member[i] for member in self._seqs]
     position = Sequence.concat(column, how='outer')
     if not self.has_positional_metadata():
         return position
     position.metadata = dict(self.positional_metadata.iloc[i])
     return position
Example #9
0
 def _get_position(self, i):
     """Build one Sequence from element ``i`` of every stored sequence."""
     column = [member[i] for member in self._seqs]
     # TODO: once positional metadata exists, attach row ``i`` of it to the
     # result, e.g. as dict(self.positional_metadata.iloc[i]), guarded by
     # self.has_positional_metadata().
     return Sequence.concat(column, how='outer')
Example #10
0
def _traceback(traceback_matrix,
               score_matrix,
               aln1,
               aln2,
               start_row,
               start_col,
               gap_character='-'):
    """Rebuild aligned sequences by walking the traceback matrix.

    Starting at ``(start_row, start_col)``, the walk follows the codes in
    ``traceback_matrix`` until the ``'alignment-end'`` code is read. Columns
    index positions of ``aln1`` and rows index positions of ``aln2`` (both
    offset by one relative to the matrix).

    Parameters
    ----------
    traceback_matrix, score_matrix
        Matrices indexed as ``matrix[row, col]``.
    aln1, aln2
        Alignments providing ``sequence_count()``, iteration over their
        sequences and integer indexing.
    start_row, start_col : int
        Cell at which the traceback starts.
    gap_character : str, optional
        Character inserted into the sequences of one alignment wherever the
        other alignment advances alone.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, current_col,
        current_row)``: two lists of ``Sequence`` objects, the score found at
        the start cell, and the column/row at which the traceback stopped.

    Raises
    ------
    ValueError
        If the traceback matrix holds a value that is not a known code.
    """
    # cache some values for simpler reference
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for _ in range(aln1_sequence_count)]

    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for _ in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    # Characters are collected back-to-front and reversed after the walk.
    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # Only aln2 advances: pad every aln1 sequence with a gap.
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
        elif current_value == hgap:
            # Only aln1 advances: pad every aln2 sequence with a gap.
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            continue
        else:
            raise ValueError("Invalid value in traceback matrix: %s" %
                             current_value)

    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = Sequence(aligned_seq, metadata={'id': seq_id})

    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        # Offset the fallback id so it cannot collide with aln1's ids.
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = Sequence(aligned_seq, metadata={'id': seq_id})

    return (aligned_seqs1, aligned_seqs2, best_score, current_col, current_row)
Example #11
0
 def _get_position(self, i):
     """Build one Sequence from element ``i`` of every stored sequence.

     When positional metadata is present, row ``i`` of it becomes the
     metadata of the returned sequence.
     """
     column = [member[i] for member in self._seqs]
     position = Sequence.concat(column, how='outer')
     if not self.has_positional_metadata():
         return position
     position.metadata = dict(self.positional_metadata.iloc[i])
     return position