Esempio n. 1
0
    def test_aligned_dna_fasta_format_validate_positive(self):
        """A well-formed aligned DNA FASTA fixture validates without error."""
        fasta_fp = self.get_data_path('aligned-dna-sequences.fasta')

        # No explicit assertion: the test passes as long as validate()
        # returns without raising.
        AlignedDNAFASTAFormat(fasta_fp, mode='r').validate()
Esempio n. 2
0
    def test_aligned_dna_fasta_format_validate_negative(self):
        """Validating the 'not-dna-sequences' fixture raises ValueError."""
        bad_fp = self.get_data_path('not-dna-sequences')
        fmt = AlignedDNAFASTAFormat(bad_fp, mode='r')

        # The error message is expected to mention the format name.
        expect_failure = self.assertRaisesRegex(ValueError, 'AlignedDNAFASTA')
        with expect_failure:
            fmt.validate()
Esempio n. 3
0
    def test_aligned_dna_fasta_format_validate_positive(self):
        """The aligned DNA FASTA fixture passes format validation."""
        fixture_fp = self.get_data_path('aligned-dna-sequences.fasta')
        fmt = AlignedDNAFASTAFormat(fixture_fp, mode='r')

        # Success is simply validate() completing without an exception.
        fmt.validate()
Esempio n. 4
0
    def test_aligned_dna_fasta_format_validate_negative(self):
        """Validating the 'not-dna-sequences' fixture raises ValidationError."""
        invalid_fp = self.get_data_path('not-dna-sequences')
        fmt = AlignedDNAFASTAFormat(invalid_fp, mode='r')

        # The error message is expected to mention the format name.
        expect_failure = self.assertRaisesRegex(
            ValidationError, 'AlignedDNAFASTA')
        with expect_failure:
            fmt.validate()
Esempio n. 5
0
def _dna_iterator_to_aligned_fasta(iterator):
    """Write the sequences in ``iterator`` to a new aligned DNA FASTA file.

    A fresh ``AlignedDNAFASTAFormat`` is created, the sequences are written
    to its path in FASTA format via scikit-bio, and the format object is
    returned.
    """
    aligned_fasta = AlignedDNAFASTAFormat()
    destination = str(aligned_fasta)
    skbio.io.write(iter(iterator), format='fasta', into=destination)
    return aligned_fasta
Esempio n. 6
0
def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
    """Run mafft on a FASTA file and return the resulting alignment.

    Parameters
    ----------
    sequences_fp
        Path to the FASTA file of unaligned sequences.
    alignment_fp
        Path to a FASTA file holding an existing alignment to add the
        sequences to, or ``None`` to align ``sequences_fp`` on its own.
    n_threads
        Value passed to mafft's ``--thread`` option; ``'auto'`` is mapped
        to ``-1``.
    parttree
        When truthy, ``--parttree`` is added to the mafft command.
    addfragments
        When an existing alignment is given, selects ``--addfragments``
        (truthy) or ``--add`` (falsy).

    Returns
    -------
    AlignedDNAFASTAFormat
        The alignment written by mafft, with the original sequence IDs
        restored.

    Raises
    ------
    ValueError
        If a sequence ID is duplicated within either input, is present in
        both inputs, or if there are more than 1,000,000 sequences and
        ``parttree`` is not set.
    """
    # mafft can truncate long sequence IDs (~250 chars), so the original IDs
    # are recorded here, in input order, and written back over the IDs in
    # mafft's output further down.
    #
    # https://github.com/qiime2/q2-alignment/issues/37
    aligned_seq_ids = {}
    if alignment_fp is not None:
        aligned_records = skbio.io.read(alignment_fp, format='fasta',
                                        constructor=skbio.DNA)
        for record in aligned_records:
            seq_id = record.metadata['id']
            if seq_id in aligned_seq_ids:
                raise ValueError(
                    "A sequence ID is duplicated in the aligned sequences: "
                    "%r" % seq_id)
            aligned_seq_ids[seq_id] = True

    unaligned_seq_ids = {}
    unaligned_records = skbio.io.read(sequences_fp, format='fasta',
                                      constructor=skbio.DNA)
    for record in unaligned_records:
        seq_id = record.metadata['id']
        if seq_id in unaligned_seq_ids:
            raise ValueError(
                "A sequence ID is duplicated in the unaligned sequences: "
                "%r" % seq_id)
        if seq_id in aligned_seq_ids:
            raise ValueError(
                "A sequence ID is present in both the aligned and unaligned "
                "sequences: %r" % seq_id)
        unaligned_seq_ids[seq_id] = True

    result = AlignedDNAFASTAFormat()
    result_fp = str(result)
    # Aligned IDs first, then unaligned IDs, each in file order.
    ids = {**aligned_seq_ids, **unaligned_seq_ids}

    # mafft fails on more than 1 million sequences unless parttree (an
    # algorithm that builds an approximate tree from a large number of
    # unaligned sequences) is used. Raising our own error here spares the
    # user mafft's error output, which can be confusing and intimidating.
    too_many_sequences = len(ids) > 1000000
    if too_many_sequences and not parttree:
        raise ValueError(
            "The number of sequences in your feature table is larger than "
            "1 million, please use the parttree parameter")

    # mafft's signal for using all cores is -1, whereas our users enter
    # 'auto'; translating here keeps the UX consistent.
    thread_count = -1 if n_threads == 'auto' else n_threads

    # `--inputorder` is required because the ID replacement below relies on
    # the output being in the same order as the input. This is mafft's
    # default behavior, but the flag is passed in case that ever changes.
    cmd = ["mafft", "--preservecase", "--inputorder",
           "--thread", str(thread_count)]

    if parttree:
        cmd.append('--parttree')

    if alignment_fp is None:
        cmd.append(sequences_fp)
    else:
        add_flag = '--addfragments' if addfragments else '--add'
        cmd.extend([add_flag, sequences_fp, alignment_fp])

    run_command(cmd, result_fp)

    # Load the output alignment into memory, put the original sequence IDs
    # back, and write the alignment to disk again.
    msa = skbio.TabularMSA.read(result_fp, format='fasta',
                                constructor=skbio.DNA)
    # An `assert` is used deliberately: a mismatch would mean mafft added or
    # dropped sequences while aligning, which would be a bug on mafft's end.
    # This is only a sanity check and is not expected to trigger in practice.
    assert len(ids) == len(msa)
    for original_id, aligned_seq in zip(ids, msa):
        aligned_seq.metadata['id'] = original_id

    # The roundtripping options are switched off to speed up writing. That
    # is safe because the sequence IDs are known to be roundtrip-safe: they
    # were read from a FASTA file above.
    #
    # http://scikit-bio.org/docs/latest/generated/
    #     skbio.io.format.fasta.html#writer-specific-parameters
    msa.write(result_fp, id_whitespace_replacement=None,
              description_newline_replacement=None)
    return result
 def setUp(self):
     """Locate the packaged trim-test alignment and load it as an iterator."""
     super().setUp()
     alignment_fp = pkg_resources.resource_filename(
         'rescript.tests', 'data/trim-test-alignment.fasta')
     self.aligned_dna_path = alignment_fp
     alignment_format = AlignedDNAFASTAFormat(alignment_fp, mode='r')
     self.aligned_dna_seqs = alignment_format.view(AlignedDNAIterator)
Esempio n. 8
0
 def setUp(self):
     """Load the degap-test alignment fixture as an AlignedDNAIterator view."""
     super().setUp()
     alignment_format = AlignedDNAFASTAFormat(
         self.get_data_path('degap-test-alignment.fasta'), mode='r')
     self.alignedseqs = alignment_format.view(AlignedDNAIterator)
    def setUp(self):
        """Load alignment fixtures and sequence dictionaries for the tests.

        Sets attributes holding imported artifacts, read-only
        ``AlignedDNAFASTAFormat`` objects for several fixture files, a primer
        dictionary, a ``FakeCtx`` instance, and three ``exp_seqs_*``
        dictionaries of sequence strings keyed by ``'s1'``..``'s5'``.
        """
        super().setUp()

        aligned_seqs_fp = self.get_data_path('trim-test-alignment.fasta')
        aligned_with_primers_fp = self.get_data_path(
            'trim-test-alignment-with-primers.fasta')

        # The base alignment is made available both as an imported artifact
        # and as a read-only format object.
        self.aligned_seqs = qiime2.Artifact.import_data(
            'FeatureData[AlignedSequence]', aligned_seqs_fp)
        self.aligned_seqs_fasta = AlignedDNAFASTAFormat(aligned_seqs_fp,
                                                        mode='r')
        # Forward and reverse primer strings, keyed by direction.
        self.primers_dict = {
            "forward": "GGGAATCTTCCACAATGG",
            "reverse": "GTGTTCTTCTCTAACAACAG"
        }
        # Same dual representation for the alignment that includes primers.
        self.aligned_with_primers = qiime2.Artifact.import_data(
            'FeatureData[AlignedSequence]', aligned_with_primers_fp)
        self.aligned_with_primers_fasta = AlignedDNAFASTAFormat(
            aligned_with_primers_fp, mode='r')
        # Remaining fixture files, opened read-only as format objects.
        self.aligned_mess_fasta = AlignedDNAFASTAFormat(
            self.get_data_path('trim-test-alignment-with-primers-mess.fasta'),
            mode='r')
        self.aligned_with_fwd_fasta = AlignedDNAFASTAFormat(
            self.get_data_path('trim-test-alignment-fwd.fasta'), mode='r')
        self.aligned_with_rev_fasta = AlignedDNAFASTAFormat(
            self.get_data_path('trim-test-alignment-rev.fasta'), mode='r')
        self.trimmed_fasta = AlignedDNAFASTAFormat(
            self.get_data_path('trim-test-sequences-trimmed.fasta'), mode='r')

        # NOTE(review): FakeCtx is defined outside this view; the meaning of
        # the integer keys 1-3 is not visible here -- confirm against its
        # definition.
        self.fake_ctx = FakeCtx({
            1: self.aligned_with_primers_fasta,
            2: self.aligned_with_fwd_fasta,
            3: self.aligned_with_rev_fasta
        })

        # Presumably the expected output sequences when both primers are
        # used -- confirm against the tests that read this attribute.
        self.exp_seqs_both_primers = {
            's1': ('GGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGCCGCGTGAG'
                   'TGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAGAAGAACAC'),
            's2': ('GGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGG'
                   'GGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGGACGAAGCG'),
            's3': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
                   'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAG'),
            's4': ('GGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGCCGCGTGCG'
                   'GGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGGAAGAAATC'),
            's5': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
                   'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAG'),
        }
        # Presumably the expected output when only the forward primer is
        # used -- confirm against the tests that read this attribute.
        self.exp_seqs_only_fwd = {
            's1': ('GGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGCCGCGTGAG'
                   'TGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAGAAGAACACG'
                   'TGCTAGG--------'),
            's2': ('GGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGG'
                   'GGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGGACGAAGCGT'
                   'TTTG-----------'),
            's3': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
                   'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAGC'
                   'TTATGGTTAAAAAAA'),
            's4': ('GGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGCCGCGTGCG'
                   'GGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGGAAGAAATCC'
                   'TCTGGGCTAAAAAAA'),
            's5': ('GGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTG'
                   'TGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGGAGGAAAAGC'
                   'TTGTGGTTAA-----'),
        }
        # Presumably the expected output when only the reverse primer is
        # used -- confirm against the tests that read this attribute.
        self.exp_seqs_only_rev = {
            's1': ('-----TAGGGAATCTTCCACAATGGGTGCAAACCTGATGGAGCAATGC'
                   'CGCGTGAGTGAAGANAGGTCTTCGGATCGTAAAGCTCTGTTGTTAGAG'
                   'AAGAACAC'),
            's2': ('AATTTTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGC'
                   'CGCGTGGGGGATGA-CGGCCTTCGGGTTGTAAACTCCTTTCGCCAGGG'
                   'ACGAAGCG'),
            's3': ('-----TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGC'
                   'CGCGTGTGTGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGG'
                   'AGGAAAAG'),
            's4': ('-----TGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCCATGC'
                   'CGCGTGCGGGAAGANAGGCCTTCGGGTTGTAAACCGCTTTTGTTCGGG'
                   'AAGAAATC'),
            's5': ('---AATGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGC'
                   'CGCGTGTGTGAAGA-AGGCCTTTTGGTTGTAAAGCACTTTAAGTGGGG'
                   'AGGAAAAG'),
        }