コード例 #1
0
ファイル: _pairwise.py プロジェクト: liupfskygre/scikit-bio
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Normalize an aligner input into an ``Alignment``.

    Parameters
    ----------
    seq : str, Sequence, or Alignment
        The object to convert. Strings are wrapped in a ``Sequence`` whose
        ``'id'`` metadata is the empty string; a ``Sequence`` lacking an
        ``'id'`` entry gets one (also the empty string) on a copy.
    disallow_alignment : bool
        When true, an ``Alignment`` input is rejected instead of being
        returned unchanged.

    Returns
    -------
    Alignment

    Raises
    ------
    TypeError
        If `seq` is an ``Alignment`` and `disallow_alignment` is true, or if
        `seq` is none of the supported types.
    """
    if isinstance(seq, string_types):
        return Alignment([Sequence(seq, metadata={'id': ''})])

    if isinstance(seq, Sequence):
        if 'id' not in seq.metadata:
            # Add the missing id to a copy rather than to the object that
            # was passed in.
            seq = seq.copy()
            seq.metadata['id'] = ''
        return Alignment([seq])

    if isinstance(seq, Alignment):
        if not disallow_alignment:
            return seq
        # Covers aligning a pair of alignments as well as an alignment and a
        # sequence. This is not supported for local alignment: there is no
        # clear use case and it is not obvious how it would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")

    raise TypeError("Unsupported type provided to aligner: %r." %
                    type(seq))
コード例 #2
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        """Align the sequences in `seq_path` against the template alignment.

        Parameters
        ----------
        seq_path : str
            Path to the FASTA file holding the candidate sequences.
        result_path : str, optional
            If given, the aligned sequences are written there as FASTA and
            ``None`` is returned.
        log_path : str, optional
            Passed to ``NastLogger``.
        failure_path : str, optional
            If given, the sequences reported as failed by ``pynast_seqs`` are
            written there as FASTA.

        Returns
        -------
        Alignment or None
            The aligned sequences when `result_path` is ``None``, otherwise
            ``None``.
        """
        # 'U' requests universal-newline mode. The candidate file is kept
        # open until pynast_seqs has returned because parse_fasta may read
        # it lazily (its implementation is not visible here).
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences, replacing '.' characters with '-'
            # characters and upper-casing them
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # convert the sequences returned by pynast_seqs into DNASequence
        # objects before wrapping them in the collection types
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is None:
            return pynast_aligned

        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None
コード例 #3
0
ファイル: align_seqs.py プロジェクト: Kleptobismol/qiime
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Run PyNAST on the sequences in `seq_path`.

        Parameters
        ----------
        seq_path : str
            Path to the FASTA file of candidate sequences.
        result_path : str, optional
            Where to write the aligned sequences (FASTA). When provided the
            method returns ``None``.
        log_path : str, optional
            Handed to ``NastLogger``.
        failure_path : str, optional
            Where to write the sequences that ``pynast_seqs`` returned as
            failures (FASTA).

        Returns
        -------
        Alignment or None
            The alignment if no `result_path` was given, ``None`` otherwise.
        """
        # The candidate file stays open until pynast_seqs returns, since
        # parse_fasta may consume it lazily (not visible from this method).
        # The context manager closes it even if alignment raises.
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            template_records = []
            with open(template_alignment_fp) as template_file:
                for seq_id, seq in parse_fasta(template_file):
                    # replace '.' characters with '-' characters
                    template_records.append(
                        (seq_id, seq.replace('.', '-').upper()))
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # re-wrap the returned sequences as DNASequence objects
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None
        return pynast_aligned
コード例 #4
0
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """Drop aligned sequences that lie too far from the majority consensus.

    Steps:
     1. build the majority consensus of the alignment parsed from `seqs`;
     2. measure each sequence's distance to that consensus (via the
        sequence's ``distance`` method);
     3. keep only the sequences whose distance is at most
        ``mean + num_stds * std`` of those distances.

    `fraction_seqs_for_stats` is accepted but not used by this function.

    Returns the filtered alignment.
    """
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()

    # one distance per sequence, in alignment order
    dists_to_consensus = [s.distance(consensus_seq) for s in aln]

    # cutoff: mean distance plus `num_stds` standard deviations
    dist_cutoff = (mean(dists_to_consensus) +
                   num_stds * std(dists_to_consensus))

    # identifiers of the sequences that are within the cutoff
    seqs_to_keep = [seq_id
                    for seq_id, dist in izip(aln.ids(), dists_to_consensus)
                    if dist <= dist_cutoff]

    return aln.subalignment(seqs_to_keep=seqs_to_keep)
コード例 #5
0
ファイル: filter_alignment.py プロジェクト: shiffer1/qiime
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """Build a lane mask string from per-position entropies.

    infile: aligned FASTA input (e.g. an open file object) that is passed to
     ``parse_fasta``.
    entropy_threshold: float designating the fraction of most entropic
     positions to mask out, i.e., 0.10 means the 10% most entropic positions
     are removed.
    existing_mask: accepted but not used by this function.

    Returns a string with one character per alignment position: "0" for a
    position to drop and "1" for a position to keep.
    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)

    # entropy value found at the requested quantile of the sorted entropies
    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: positions at the lowest entropy are never masked.
    highest_certainty = min(uncertainty_sorted)

    return "".join(
        "0" if (base >= max_uncertainty and base != highest_certainty)
        else "1"
        for base in uncertainty)
コード例 #6
0
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """Compute a "0"/"1" lane mask from the entropy of each position.

    infile: aligned FASTA input (e.g. an open file object) handed to
     ``parse_fasta``.
    entropy_threshold: float value giving the fraction of most entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic
     positions are removed.
    existing_mask: accepted but not used by this function.

    The returned string has one character per alignment position; "1" marks
    a position to keep and "0" a position to remove.
    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)
    last_index = len(uncertainty_sorted) - 1
    cutoff_index = int(round(last_index * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: the lowest-entropy positions are always kept.
    highest_certainty = min(uncertainty_sorted)

    def flag(base):
        # "0" only for positions at or above the cutoff entropy
        if base >= max_uncertainty and base != highest_certainty:
            return "0"
        return "1"

    return "".join(flag(base) for base in uncertainty)
コード例 #7
0
ファイル: make_phylogeny.py プロジェクト: Kleptobismol/qiime
    def getResult(self, aln_path, *args, **kwargs):
        """Build a tree from the FASTA alignment stored at `aln_path`.

        The tree is produced by ``build_tree_from_alignment`` of the module
        configured under ``self.Params['Module']``. If the keyword argument
        ``root_method`` equals ``'midpoint'`` the tree is passed through
        ``root_midpt``; for ``'tree_method_default'``, any other value, or
        when the keyword is absent, the tree is returned as built.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        with open(aln_path) as aln_f:
            seqs = Alignment.from_fasta_records(
                parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]),
                DNA)
        # This ugly little line of code lets us pass a skbio Alignment when a
        # a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = module.build_tree_from_alignment(seqs)

        # Look the option up with .get() rather than try/except KeyError so
        # that a KeyError raised while rooting the tree is not silently
        # swallowed as if the option were missing.
        if kwargs.get('root_method') == 'midpoint':
            result = root_midpt(result)
        return result
コード例 #8
0
ファイル: filter_alignment.py プロジェクト: DSWallach/qiime
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """Filter out sequences that differ strongly from the consensus.

    Given aligned sequences, this will:
     1. compute the majority consensus of the alignment;
     2. compute every sequence's distance to the consensus (using the
        sequence's ``distance`` method) and the mean/std of these values;
     3. discard sequences whose distance exceeds the mean by more than
        `num_stds` standard deviations.

    Note that `fraction_seqs_for_stats` is part of the signature but is not
    used by this function.

    The filtered alignment is returned.
    """
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()
    dists_to_consensus = [s.distance(consensus_seq) for s in aln]

    average_distance = mean(dists_to_consensus)
    std_distance = std(dists_to_consensus)
    dist_cutoff = average_distance + num_stds * std_distance

    # pair every identifier with its distance and keep those at or below
    # the cutoff
    id_dist_pairs = izip(aln.ids(), dists_to_consensus)
    seqs_to_keep = [seq_id for seq_id, dist in id_dist_pairs
                    if dist <= dist_cutoff]

    return aln.subalignment(seqs_to_keep=seqs_to_keep)
コード例 #9
0
def _fasta_to_alignment(fh, qual=FileSentinel, constructor=Sequence, **kwargs):
    """Collect everything ``_fasta_to_generator`` yields into an Alignment.

    All arguments are forwarded unchanged to ``_fasta_to_generator``.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor,
                                  **kwargs)
    return Alignment(list(records))
コード例 #10
0
ファイル: clustal.py プロジェクト: liupfskygre/scikit-bio
def _clustal_to_alignment(fh, strict=True):
    r"""Read a Clustal multiple sequence alignment into an ``Alignment``.

    Parameters
    ----------
    fh : open file object
        An open Clustal file.
    strict : boolean
        Whether or not to raise a ``ClustalFormatError``
        when no labels are found.

    Returns
    -------
    skbio.Alignment
        Alignment object containing aligned biological sequences

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If the sequences in `fh` don't have the same sequence length
        or if the sequence ids don't properly match with the subsequences

    Notes
    -----
    Skips any line that starts with a blank.

    ``_clustal_to_alignment`` preserves the order of the sequences from the
    original file.  However, it does use a dict as an intermediate, so
    two sequences can't have the same label. This is probably OK since
    Clustal will refuse to run on a FASTA file in which two sequences have
    the same label, but could potentially cause trouble with manually
    edited files (all the segments of the conflicting sequences would
    be interleaved, possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ,  "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through sequence
        weighting, position-specific gap penalties and weight matrix choice",
        Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """
    # keep only sequence lines, then strip any trailing position numbers
    seq_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, seq_lines)
    data, labels = _label_line_parser(records, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # join the chunks collected for each label, following the label order
    return Alignment([Sequence(sequence=''.join(data[label]),
                               metadata={'id': label})
                      for label in labels])
コード例 #11
0
    def getResult(self, aln_path, *args, **kwargs):
        """Return the tree built from the FASTA alignment at `aln_path`.

        ``self.Params['Module']`` supplies ``build_tree_from_alignment``.
        When the keyword argument ``root_method`` is ``'midpoint'`` the
        resulting tree is passed to ``root_midpt``; otherwise (including
        ``'tree_method_default'`` and a missing keyword) the tree is returned
        exactly as the module built it.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        with open(aln_path) as aln_f:
            seqs = Alignment.from_fasta_records(
                parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]), DNA)
        # This ugly little line of code lets us pass a skbio Alignment when a
        # a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = module.build_tree_from_alignment(seqs)

        # A plain lookup with a default replaces the former try/except
        # KeyError, which also hid any KeyError raised by root_midpt itself.
        root_method = kwargs.get('root_method')
        if root_method == 'midpoint':
            result = root_midpt(result)
        return result
コード例 #12
0
ファイル: fastq.py プロジェクト: squarednob/scikit-bio
def _fastq_to_alignment(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=BiologicalSequence):
    """Collect everything ``_fastq_to_generator`` yields into an Alignment.

    All arguments are forwarded unchanged to ``_fastq_to_generator``.
    """
    records = _fastq_to_generator(fh,
                                  variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor)
    return Alignment(list(records))
コード例 #13
0
ファイル: fastq.py プロジェクト: liupfskygre/scikit-bio
def _fastq_to_alignment(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=Sequence,
                        **kwargs):
    """Build an Alignment from the output of ``_fastq_to_generator``.

    Every argument, including extra keyword arguments, is forwarded unchanged
    to ``_fastq_to_generator``.
    """
    records = _fastq_to_generator(fh,
                                  variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor,
                                  **kwargs)
    return Alignment(list(records))
コード例 #14
0
    def test_call_pynast_test1_alt_min_len(self):
        """PyNastAligner: returns no result when min_len too high
        """
        params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 1000
        }
        aligner = PyNastAligner(params)

        # with this min_len the expected outcome is an empty alignment
        self.assertEqual(aligner(self.pynast_test1_input_fp), Alignment([]))
コード例 #15
0
ファイル: filter_alignment.py プロジェクト: DSWallach/qiime
def apply_lane_mask_and_gap_filter(fastalines,
                                   mask,
                                   allowed_gap_frac=1. - 1e-6,
                                   entropy_threshold=None):
    """Filter alignment positions and yield the result as FASTA lines.

    At most one of `mask` and `entropy_threshold` may be given. Positions
    are additionally dropped with ``omit_gap_positions(allowed_gap_frac)``
    whenever `allowed_gap_frac` is below 1.

    This is a generator: nothing is parsed and no error is raised until
    iteration starts. For every remaining sequence it yields a ``>id`` line
    followed by the sequence line.

    Raises ValueError if both `mask` and `entropy_threshold` are given, if
    `entropy_threshold` is outside [0, 1], or if filtering leaves no
    positions.
    """
    # load the alignment
    aln = Alignment.from_fasta_records(parse_fasta(fastalines), DNA)

    mask_given = mask is not None
    threshold_given = entropy_threshold is not None

    if mask_given and threshold_given:
        raise ValueError('only mask or entropy threshold can be provided.')

    if mask_given:
        # a pre-computed mask (e.g., Lane mask) is applied first; gaps are
        # removed afterwards so that mask positions still line up with the
        # alignment positions when the mask is applied
        aln = aln.subalignment(positions_to_keep=mask_to_positions(mask))
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif threshold_given:
        # the mask is computed on the fly. Highly gapped positions are
        # removed first so their entropies do not take part in choosing the
        # entropy cutoff (mostly-gap positions would count as many
        # low-entropy positions)
        if not (0 <= entropy_threshold <= 1):
            raise ValueError('entropy_threshold needs to be between 0 and 1'
                             ' (inclusive)')
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
        computed_mask = generate_lane_mask(aln, entropy_threshold)
        aln = aln.subalignment(
            positions_to_keep=mask_to_positions(computed_mask))
    elif allowed_gap_frac < 1:
        # neither mask nor threshold: only the gap filter applies
        aln = aln.omit_gap_positions(allowed_gap_frac)

    if aln.sequence_length() == 0:
        raise ValueError("Positional filtering resulted in removal of all "
                         "alignment positions.")
    for seq in aln:
        yield ">%s\n" % seq.id
        yield "%s\n" % seq
コード例 #16
0
def apply_lane_mask_and_gap_filter(fastalines, mask,
                                   allowed_gap_frac=1.-1e-6,
                                   entropy_threshold=None):
    """Apply a position mask and a gap filter, yielding FASTA lines.

    Either a pre-computed `mask` or an `entropy_threshold` (from which a mask
    is generated) may be supplied, but not both. When `allowed_gap_frac` is
    below 1, ``omit_gap_positions(allowed_gap_frac)`` is applied as well.

    Being a generator, the function does its work (and raises its errors)
    only once iteration begins. Each remaining sequence produces two lines:
    ``>id`` and the sequence itself.

    Raises ValueError when both `mask` and `entropy_threshold` are supplied,
    when `entropy_threshold` is not within [0, 1], or when no alignment
    positions survive the filtering.
    """
    # load the alignment
    aln = Alignment.from_fasta_records(parse_fasta(fastalines), DNA)

    if mask is not None and entropy_threshold is not None:
        raise ValueError('only mask or entropy threshold can be provided.')

    if mask is not None:
        # apply the supplied mask before removing gapped positions, so the
        # mask indices refer to the alignment as it was loaded
        positions = mask_to_positions(mask)
        aln = aln.subalignment(positions_to_keep=positions)
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif entropy_threshold is not None:
        if not (0 <= entropy_threshold <= 1):
            raise ValueError('entropy_threshold needs to be between 0 and 1'
                             ' (inclusive)')
        # remove highly gapped positions first so that they are not counted
        # as low-entropy positions when the entropy cutoff is determined
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
        positions = mask_to_positions(
            generate_lane_mask(aln, entropy_threshold))
        aln = aln.subalignment(positions_to_keep=positions)
    else:
        # no mask of either kind: only the gap filter can apply
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)

    if aln.sequence_length() == 0:
        raise ValueError("Positional filtering resulted in removal of all "
                         "alignment positions.")
    for seq in aln:
        yield ">%s\n" % seq.id
        yield "%s\n" % seq
コード例 #17
0
    def test_call_infernal_test1_file_output(self):
        """InfernalAligner writes correct output files for infernal_test1 seqs
        """
        # request file output; the call itself should then return None
        returned = self.infernal_test1_aligner(
            self.infernal_test1_input_fp,
            result_path=self.result_fp,
            log_path=self.log_fp)
        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        # the alignment read back from the result file must match
        expected = self.infernal_test1_expected_aln
        with open(self.result_fp) as result_f:
            records = parse_fasta(result_f)
            written_aln = Alignment.from_fasta_records(records, DNA)
        self.assertEqual(written_aln, expected)
コード例 #18
0
ファイル: test_align_seqs.py プロジェクト: Springbudder/qiime
    def test_call_infernal_test1_file_output(self):
        """InfernalAligner writes correct output files for infernal_test1 seqs
        """
        # results go to disk, so nothing should come back from the call
        call_kwargs = {'result_path': self.result_fp,
                       'log_path': self.log_fp}
        outcome = self.infernal_test1_aligner(self.infernal_test1_input_fp,
                                              **call_kwargs)
        self.assertTrue(outcome is None,
                        "Result should be None when result path provided.")

        # compare the alignment on disk with the expected one
        expected = self.infernal_test1_expected_aln
        with open(self.result_fp) as result_f:
            from_disk = Alignment.from_fasta_records(
                parse_fasta(result_f), DNA)
        self.assertEqual(from_disk, expected)
コード例 #19
0
ファイル: test_align_seqs.py プロジェクト: shiffer1/qiime
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # request file output; the call itself should then return None
        returned = self.pynast_test1_aligner(
            self.pynast_test1_input_fp,
            result_path=self.result_fp,
            log_path=self.log_fp,
            failure_path=self.failure_fp)
        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        # aligned sequences read back from the result file
        expected = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            records = parse_fasta(result_f)
            written_aln = Alignment.from_fasta_records(records, DNA)
        self.assertEqual(written_aln, expected)

        # failed sequences are compared through their FASTA text
        with open(self.failure_fp) as failure_f:
            fail_records = parse_fasta(failure_f)
            written_fail = SequenceCollection.from_fasta_records(
                fail_records, DNA)
        self.assertEqual(written_fail.to_fasta(),
                         self.pynast_test1_expected_fail.to_fasta())
コード例 #20
0
ファイル: test_align_seqs.py プロジェクト: Springbudder/qiime
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # everything is written to files, so the call should return None
        call_kwargs = {'result_path': self.result_fp,
                       'log_path': self.log_fp,
                       'failure_path': self.failure_fp}
        outcome = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                            **call_kwargs)
        self.assertTrue(outcome is None,
                        "Result should be None when result path provided.")

        # check the alignment that was written out
        expected = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            aln_from_disk = Alignment.from_fasta_records(
                parse_fasta(result_f), DNA)
        self.assertEqual(aln_from_disk, expected)

        # check the failures that were written out (as FASTA text)
        with open(self.failure_fp) as failure_f:
            fail_from_disk = SequenceCollection.from_fasta_records(
                parse_fasta(failure_f), DNA)
        self.assertEqual(fail_from_disk.to_fasta(),
                         self.pynast_test1_expected_fail.to_fasta())
コード例 #21
0
    def setUp(self):
        """Create the temp files, aligner and expected alignment for tests."""

        def new_temp_path(suffix):
            # mkstemp hands back an open descriptor; only the path is needed
            fd, path = mkstemp(prefix='InfernalAlignerTests_', suffix=suffix)
            close(fd)
            return path

        # input sequences
        self.infernal_test1_input_fp = new_temp_path('.fasta')
        with open(self.infernal_test1_input_fp, 'w') as in_f:
            in_f.write('\n'.join(infernal_test1_input_fasta))

        # template in Stockholm format
        self.infernal_test1_template_fp = new_temp_path('template.sto')
        with open(self.infernal_test1_template_fp, 'w') as in_f:
            in_f.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = new_temp_path('.fasta')
        open(self.result_fp, 'w').close()

        self.log_fp = new_temp_path('.log')
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        aligner_params = {
            'template_filepath': self.infernal_test1_template_fp,
        }
        self.infernal_test1_aligner = InfernalAligner(aligner_params)

        expected_records = parse_fasta(infernal_test1_expected_alignment)
        self.infernal_test1_expected_aln = Alignment.from_fasta_records(
            expected_records, DNA)
コード例 #22
0
ファイル: test_align_seqs.py プロジェクト: Springbudder/qiime
    def setUp(self):
        """Prepare temp files, the aligner and the expected alignment."""
        prefix = 'InfernalAlignerTests_'

        def make_temp(suffix):
            # close the descriptor returned by mkstemp and keep the path
            handle, path = mkstemp(prefix=prefix, suffix=suffix)
            close(handle)
            return path

        def touch(path):
            open(path, 'w').close()

        self.infernal_test1_input_fp = make_temp('.fasta')
        with open(self.infernal_test1_input_fp, 'w') as in_f:
            in_f.write('\n'.join(infernal_test1_input_fasta))

        self.infernal_test1_template_fp = make_temp('template.sto')
        with open(self.infernal_test1_template_fp, 'w') as in_f:
            in_f.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = make_temp('.fasta')
        touch(self.result_fp)

        self.log_fp = make_temp('.log')
        touch(self.log_fp)

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner(
            {'template_filepath': self.infernal_test1_template_fp})
        self.infernal_test1_expected_aln = Alignment.from_fasta_records(
            parse_fasta(infernal_test1_expected_alignment), DNA)
コード例 #23
0
    def setUp(self):
        """Create the temp files, aligner and expected results for tests."""

        def new_temp_path(suffix):
            # mkstemp hands back an open descriptor; only the path is needed
            fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
            close(fd)
            return path

        def write_text(path, text):
            with open(path, 'w') as f:
                f.write(text)

        # candidate sequences
        self.pynast_test1_input_fp = new_temp_path('.fasta')
        write_text(self.pynast_test1_input_fp, pynast_test1_input_fasta)

        # template as given
        self.pynast_test1_template_fp = new_temp_path('template.fasta')
        write_text(self.pynast_test1_template_fp, pynast_test1_template_fasta)

        # template variants: '.' for '-', 'U' for 'T', and lower case
        self.pynast_test_template_w_dots_fp = new_temp_path('template.fasta')
        write_text(self.pynast_test_template_w_dots_fp,
                   pynast_test1_template_fasta.replace('-', '.'))

        self.pynast_test_template_w_u_fp = new_temp_path('template.fasta')
        write_text(self.pynast_test_template_w_u_fp,
                   pynast_test1_template_fasta.replace('T', 'U'))

        self.pynast_test_template_w_lower_fp = new_temp_path('template.fasta')
        write_text(self.pynast_test_template_w_lower_fp,
                   pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = new_temp_path('.fasta')
        open(self.result_fp, 'w').close()
        self.failure_fp = new_temp_path('.fasta')
        open(self.failure_fp, 'w').close()
        self.log_fp = new_temp_path('.log')
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        }
        self.pynast_test1_aligner = PyNastAligner(aligner_params)

        expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
            expected_aln_records, DNA)
        expected_fail_records = parse_fasta(pynast_test1_expected_failure)
        self.pynast_test1_expected_fail = \
            SequenceCollection.from_fasta_records(expected_fail_records, DNA)
コード例 #24
0
ファイル: fasta.py プロジェクト: squarednob/scikit-bio
def _fasta_to_alignment(fh, qual=FileSentinel, constructor=BiologicalSequence):
    """Read every record from ``fh`` and wrap them in an ``Alignment``.

    The records are produced by ``_fasta_to_generator``, which receives
    ``qual`` and ``constructor`` unchanged. The generator is fully consumed
    into a list before the ``Alignment`` is built.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor)
    sequences = list(records)
    return Alignment(sequences)
コード例 #25
0
ファイル: test_align_seqs.py プロジェクト: Springbudder/qiime
    def setUp(self):
        """Create the temporary files and fixtures used by the tests.

        Writes the input FASTA and four variants of the template FASTA
        (unchanged, gaps as dots, T replaced by U, lower-cased) to temporary
        files, creates empty result/failure/log files, records every path in
        ``self._paths_to_clean_up``, and builds the aligner and the expected
        alignment / failure collections.
        """
        def make_temp_file(suffix, contents=''):
            # mkstemp returns an open descriptor; close it and reopen the
            # file by path so the contents go through a regular file object.
            # Writing the empty string leaves the file empty ("touched").
            fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
            close(fd)
            with open(path, 'w') as handle:
                handle.write(contents)
            return path

        # candidate sequences
        self.pynast_test1_input_fp = make_temp_file(
            '.fasta', pynast_test1_input_fasta)

        # template alignment and its variants
        self.pynast_test1_template_fp = make_temp_file(
            'template.fasta', pynast_test1_template_fasta)
        self.pynast_test_template_w_dots_fp = make_temp_file(
            'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
        self.pynast_test_template_w_u_fp = make_temp_file(
            'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
        self.pynast_test_template_w_lower_fp = make_temp_file(
            'template.fasta', pynast_test1_template_fasta.lower())

        # output files are created empty up front so that they can be
        # cleaned up reliably
        self.result_fp = make_temp_file('.fasta')
        self.failure_fp = make_temp_file('.fasta')
        self.log_fp = make_temp_file('.log')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        }
        self.pynast_test1_aligner = PyNastAligner(aligner_params)

        expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
            expected_aln_records, DNA)

        expected_fail_records = parse_fasta(pynast_test1_expected_failure)
        self.pynast_test1_expected_fail = \
            SequenceCollection.from_fasta_records(expected_fail_records, DNA)
コード例 #26
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):
        """Align the sequences in ``seq_path`` to the template with Infernal.

        Parameters
        ----------
        seq_path : str
            Path to the FASTA file holding the candidate sequences.
        result_path : str, optional
            If given, the alignment is written to this path as FASTA and
            ``None`` is returned.
        log_path : str, optional
            If given, the cmbuild and cmalign parameters that were used are
            written to this path.
        failure_path : str, optional
            Not used by this method.
        cmbuild_params : dict, optional
            Additional cmbuild parameters. ``--gapthresh`` is always forced
            to 1.0. The dict passed in is copied, not modified.
        cmalign_params : dict, optional
            Additional cmalign parameters. ``--sub`` is always forced to True
            and ``--gapthresh`` to 1.0. The dict passed in is copied, not
            modified.

        Returns
        -------
        Alignment or None
            The aligned candidate sequences when ``result_path`` is None,
            otherwise ``None``.

        Raises
        ------
        ValueError
            If parsing the template file raises ``RecordError``.
        """
        log_params = []

        # load candidate sequences; the handle is closed as soon as the
        # records have been read into the dict
        with open(seq_path, 'U') as seq_file:
            candidate_sequences = dict(parse_fasta(seq_file))

        # load template sequences (only the first record is used)
        try:
            with open(self.Params['template_filepath'], 'U') as template_file:
                info, template_alignment, struct = list(MinimalRfamParser(
                    template_file, seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with "
                "corresponding secondary structure annotation when using "
                "InfernalAligner.")

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model. Work on a copy so the caller's dict is left untouched.
        cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
        cmbuild_params['--gapthresh'] = 1.0

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned
        # sequences are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model. Again a copy, so the caller's dict is left untouched.
        cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(
            aln=template_alignment,
            structure_string=struct,
            seqs=mapped_seq_tuples,
            include_aln=True,
            params=cmalign_params,
            cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment, restoring the
        # original ids (note that ``aligned`` is a cogent alignment object,
        # hence the use of NamedSeqs).
        aligned_dict = aligned.NamedSeqs
        infernal_aligned = [(old_id, aligned_dict[new_id])
                            for new_id, old_id in new_to_old_ids.iteritems()]

        # Create an Alignment object from the (id, sequence) records
        infernal_aligned = Alignment.from_fasta_records(infernal_aligned,
                                                        DNASequence)

        if log_path is not None:
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join(log_params))

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(infernal_aligned.to_fasta())
            return None

        return infernal_aligned
コード例 #27
0
ファイル: phylip.py プロジェクト: jdrudolph/scikit-bio
def _phylip_to_alignment(fh, constructor=Sequence):
    """Build an ``Alignment`` from the records parsed out of ``fh``.

    ``_parse_phylip_raw`` yields ``(sequence, id)`` pairs. Each sequence is
    passed to ``constructor`` with its id stored under the ``'id'`` metadata
    key.
    """
    sequences = []
    for raw_seq, seq_id in _parse_phylip_raw(fh):
        sequences.append(constructor(raw_seq, metadata={'id': seq_id}))
    return Alignment(sequences)
コード例 #28
0
ファイル: _pairwise.py プロジェクト: liupfskygre/scikit-bio
def local_pairwise_align_ssw(sequence1,
                             sequence2,
                             constructor=Sequence,
                             **kwargs):
    """Align query and target sequences with Striped Smith-Waterman.

    Parameters
    ----------
    sequence1 : str or Sequence
        The first unaligned sequence (used as the query).
    sequence2 : str or Sequence
        The second unaligned sequence (used as the target).
    constructor : Sequence subclass
        Type used to build the aligned sequences when the ``protein`` keyword
        is not true. When it is true, ``Protein`` is used instead.
    **kwargs
        Passed on to ``skbio.alignment.StripedSmithWaterman``.

    Returns
    -------
    ``skbio.alignment.Alignment`` or None
        The resulting alignment, or ``None`` if the alignment has no cigar
        (i.e. it did not pass a provided filter).

    Notes
    -----
    This is a wrapper for the SSW package [1]_.

    For a complete list of optional keyword-arguments that can be provided,
    see ``skbio.alignment.StripedSmithWaterman``.

    The following kwargs will not have any effect: `suppress_sequences` and
    `zero_index`; both are overwritten here. `protein` is set to True
    whenever `sequence1` is a ``Protein`` instance.

    References
    ----------
    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
       Applications". PLOS ONE (2013). Web. 11 July 2014.
       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138

    See Also
    --------
    skbio.alignment.StripedSmithWaterman

    """
    # The aligned sequences are required to build an `Alignment`, so the
    # caller is not allowed to suppress them.
    kwargs['suppress_sequences'] = False
    kwargs['zero_index'] = True
    if isinstance(sequence1, Protein):
        kwargs['protein'] = True

    aligner = StripedSmithWaterman(str(sequence1), **kwargs)
    result = aligner(str(sequence2))

    # No cigar means a filter was not met.
    if not result.cigar:
        return None

    if result.query_begin == -1:
        positions = None
    else:
        positions = [(result.query_begin, result.query_end),
                     (result.target_begin, result.target_end_optimal)]

    # Protein alignments always yield Protein objects; everything else goes
    # through the caller-supplied constructor.
    seq_type = Protein if kwargs.get('protein', False) else constructor
    aligned_seqs = [
        seq_type(result.aligned_query_sequence, metadata={'id': 'query'}),
        seq_type(result.aligned_target_sequence, metadata={'id': 'target'}),
    ]

    return Alignment(aligned_seqs,
                     score=result.optimal_alignment_score,
                     start_end_positions=positions)
コード例 #29
0
def local_pairwise_align(seq1, seq2, gap_open_penalty,
                         gap_extend_penalty, substitution_matrix):
    """Locally align exactly two seqs with Smith-Waterman

    Parameters
    ----------
    seq1 : str or BiologicalSequence
        The first unaligned sequence.
    seq2 : str or BiologicalSequence
        The second unaligned sequence.
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).

    Returns
    -------
    skbio.Alignment
        ``Alignment`` object containing the aligned sequences as well as
        details about the alignment.

    See Also
    --------
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm was originally described in [1]_. The scikit-bio
    implementation was validated against the EMBOSS water web server [2]_.

    An ``EfficiencyWarning`` is issued on every call.

    References
    ----------
    .. [1] Identification of common molecular subsequences.
       Smith TF, Waterman MS.
       J Mol Biol. 1981 Mar 25;147(1):195-7.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_water/

    """
    warn("You're using skbio's python implementation of Smith-Waterman "
         "alignment. This will be very slow (e.g., thousands of times slower) "
         "than skbio.alignment.local_pairwise_align_ssw.",
         EfficiencyWarning)

    # Alignment inputs are rejected here (disallow_alignment=True).
    seq1 = _coerce_alignment_input_type(seq1, disallow_alignment=True)
    seq2 = _coerce_alignment_input_type(seq2, disallow_alignment=True)

    scores, tracebacks = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=0.0,
        init_matrices_f=_init_matrices_sw)

    # The traceback starts from the cell holding the maximum score.
    best_flat_index = np.argmax(scores)
    last_row, last_col = np.unravel_index(best_flat_index, scores.shape)

    traceback_result = _traceback(tracebacks, scores, seq1, seq2,
                                  last_row, last_col)
    aligned1, aligned2, score, seq1_start, seq2_start = traceback_result

    positions = [(seq1_start, last_col - 1),
                 (seq2_start, last_row - 1)]

    return Alignment(aligned1 + aligned2, score=score,
                     start_end_positions=positions)
コード例 #30
0
def global_pairwise_align(seq1, seq2, gap_open_penalty, gap_extend_penalty,
                          substitution_matrix, penalize_terminal_gaps=False):
    """Globally align a pair of seqs or alignments with Needleman-Wunsch

    Parameters
    ----------
    seq1 : str, BiologicalSequence, or Alignment
        The first unaligned sequence(s).
    seq2 : str, BiologicalSequence, or Alignment
        The second unaligned sequence(s).
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).
    penalize_terminal_gaps: bool, optional
        If True, will continue to penalize gaps even after one sequence has
        been aligned through its end. This behavior is true Needleman-Wunsch
        alignment, but results in (biologically irrelevant) artifacts when
        the sequences being aligned are of different length. This is ``False``
        by default, which is very likely to be the behavior you want in all or
        nearly all cases.

    Returns
    -------
    skbio.Alignment
        ``Alignment`` object containing the aligned sequences as well as
        details about the alignment.

    See Also
    --------
    local_pairwise_align
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm (in a slightly more basic form) was originally described
    in [1]_. The scikit-bio implementation was validated against the
    EMBOSS needle web server [2]_.

    This function can be used to align either a pair of sequences, a pair of
    alignments, or a sequence and an alignment.

    An ``EfficiencyWarning`` is issued on every call.

    References
    ----------
    .. [1] A general method applicable to the search for similarities in
       the amino acid sequence of two proteins.
       Needleman SB, Wunsch CD.
       J Mol Biol. 1970 Mar;48(3):443-53.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_needle/

    """
    warn("You're using skbio's python implementation of Needleman-Wunsch "
         "alignment. This is known to be very slow (e.g., thousands of times "
         "slower than a native C implementation). We'll be adding a faster "
         "version soon (see https://github.com/biocore/scikit-bio/issues/254 "
         "to track progress on this).", EfficiencyWarning)

    # Alignment inputs are accepted here (disallow_alignment=False).
    seq1 = _coerce_alignment_input_type(seq1, disallow_alignment=False)
    seq2 = _coerce_alignment_input_type(seq2, disallow_alignment=False)

    # The matrix initializer decides whether terminal gaps are penalized.
    init_matrices_f = (_init_matrices_nw if penalize_terminal_gaps
                       else _init_matrices_nw_no_terminal_gap_penalty)

    scores, tracebacks = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=-np.inf,
        init_matrices_f=init_matrices_f,
        penalize_terminal_gaps=penalize_terminal_gaps)

    # The traceback always starts from the last cell of the matrix.
    last_row = tracebacks.shape[0] - 1
    last_col = tracebacks.shape[1] - 1

    traceback_result = _traceback(tracebacks, scores, seq1, seq2,
                                  last_row, last_col)
    aligned1, aligned2, score, seq1_start, seq2_start = traceback_result

    positions = [(seq1_start, last_col - 1),
                 (seq2_start, last_row - 1)]

    return Alignment(aligned1 + aligned2, score=score,
                     start_end_positions=positions)
コード例 #31
0
ファイル: align_seqs.py プロジェクト: Kleptobismol/qiime
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):
        """Align the sequences in ``seq_path`` to the template with Infernal.

        Parameters
        ----------
        seq_path : str
            Path to the FASTA file holding the candidate sequences.
        result_path : str, optional
            If given, the alignment is written to this path as FASTA and
            ``None`` is returned.
        log_path : str, optional
            If given, the cmbuild and cmalign parameters that were used are
            written to this path.
        failure_path : str, optional
            Not used by this method.
        cmbuild_params : dict, optional
            Additional cmbuild parameters. ``--gapthresh`` is always forced
            to 1.0. The dict passed in is copied, not modified.
        cmalign_params : dict, optional
            Additional cmalign parameters. ``--sub`` is always forced to True
            and ``--gapthresh`` to 1.0. The dict passed in is copied, not
            modified.

        Returns
        -------
        Alignment or None
            The aligned candidate sequences when ``result_path`` is None,
            otherwise ``None``.

        Raises
        ------
        ValueError
            If parsing the template file raises ``RecordError``.
        """
        log_params = []

        # load candidate sequences; the handle is closed as soon as the
        # records have been read into the dict
        with open(seq_path, 'U') as seq_file:
            candidate_sequences = dict(parse_fasta(seq_file))

        # load template sequences (only the first record is used)
        try:
            with open(self.Params['template_filepath'], 'U') as template_file:
                info, template_alignment, struct = list(MinimalRfamParser(
                    template_file, seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with "
                "corresponding secondary structure annotation when using "
                "InfernalAligner.")

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model. Work on a copy so the caller's dict is left untouched.
        cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
        cmbuild_params['--gapthresh'] = 1.0

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned
        # sequences are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model. Again a copy, so the caller's dict is left untouched.
        cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(
            aln=template_alignment,
            structure_string=struct,
            seqs=mapped_seq_tuples,
            include_aln=True,
            params=cmalign_params,
            cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment, restoring the
        # original ids (note that ``aligned`` is a cogent alignment object,
        # hence the use of NamedSeqs).
        aligned_dict = aligned.NamedSeqs
        infernal_aligned = [(old_id, aligned_dict[new_id])
                            for new_id, old_id in new_to_old_ids.iteritems()]

        # Create an Alignment object from the (id, sequence) records
        infernal_aligned = Alignment.from_fasta_records(infernal_aligned,
                                                        DNASequence)

        if log_path is not None:
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join(log_params))

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(infernal_aligned.to_fasta())
            return None

        return infernal_aligned