Beispiel #1
0
    def test_basic(self):
        """Mask a small three-sequence alignment and compare the result.

        The input has a gap in its first column for two of the three
        sequences; the expected alignment contains only the last two columns.
        """
        def build(records):
            # Assemble a TabularMSA from (id, residues) pairs.
            return skbio.TabularMSA([
                skbio.DNA(residues,
                          metadata={'id': seq_id, 'description': ''})
                for seq_id, residues in records
            ])

        alignment = build([('seq1', 'AGA'), ('seq2', '-GA'), ('seq3', '-GC')])

        actual = mask(alignment, max_gap_frequency=0.05, min_conservation=0.30)

        expected = build([('seq1', 'GA'), ('seq2', 'GA'), ('seq3', 'GC')])

        self.assertEqual(actual, expected)
Beispiel #2
0
    def test_apply_mask_mask_some(self):
        """Apply three partial boolean masks to ``self.msa1``.

        Each case pairs the boolean flags handed to ``_apply_mask`` with the
        residues expected for sequences s1, s2 and s3 afterwards.
        """
        seq_ids = ('s1', 's2', 's3')
        cases = [
            ([False, True, True, True], ('A', 'A', '-')),
            ([False, True, True, False], ('AT', 'AT', '-T')),
            ([False, True, False, False], ('AGT', 'A-T', '--T')),
        ]

        for flags, residues in cases:
            obs = _apply_mask(self.msa1, np.array(flags))
            exp = skbio.TabularMSA(
                [skbio.DNA(chars, metadata={'id': seq_id})
                 for seq_id, chars in zip(seq_ids, residues)],
                minter='id')
            self.assertEqual(obs, exp)
Beispiel #3
0
    def test_conservation_boundaries(self):
        """Check masking at the extreme ``min_conservation`` settings.

        A single all-'A' column is expected to come back unchanged with
        ``min_conservation=1.0``, and a single all-gap column is expected to
        come back unchanged with ``min_conservation=0.0`` (both with
        ``max_gap_frequency=1.0``).
        """
        def single_column(symbol):
            # Three one-position sequences that all hold the same symbol.
            return skbio.TabularMSA([
                skbio.DNA(symbol, metadata={'id': seq_id, 'description': ''})
                for seq_id in ('seq1', 'seq2', 'seq3')
            ])

        alignment1 = single_column('A')
        alignment2 = single_column('-')

        result = mask(alignment1, max_gap_frequency=1.0, min_conservation=1.0)
        self.assertEqual(result, alignment1)

        result = mask(alignment2, max_gap_frequency=1.0, min_conservation=0.0)
        self.assertEqual(result, alignment2)
Beispiel #4
0
    def test_empty_input(self):
        """``mask`` must raise ``ValueError`` for alignments with no content.

        Two inputs are tried: three zero-length sequences, and an alignment
        that holds no sequences at all.
        """
        zero_length = skbio.TabularMSA([
            skbio.DNA('', metadata={'id': seq_id, 'description': ''})
            for seq_id in ('seq1', 'seq2', 'seq3')
        ])
        self.assertRaises(ValueError, mask, zero_length)

        no_sequences = skbio.TabularMSA([])
        self.assertRaises(ValueError, mask, no_sequences)
Beispiel #5
0
def trim(OGid):
    """Trim the alignment identified by ``OGid`` and write it as Stockholm.

    Steps:
      0. Read ``../align_fastas1/out/{OGid}.mfa``; if that file does not
         exist, read ``../align_fastas2-2/out/{OGid}.mfa`` instead.
      1. Build a boolean gap matrix and the per-column gap counts.
      2. Run ``trim_conserved`` and ``trim_insertions`` with the settings in
         the module-level ``tp`` mapping (and ``matrix``).
      3. Merge both results: wherever they disagree the symbol becomes '-'.
      4. Columns that are gaps in every sequence after trimming get their
         original symbols back and are flagged '.' in the ``RF`` annotation;
         all other columns are flagged 'x'.
      5. Write the result to ``out/{OGid}.sto``.

    Returns nothing; the only output is the written file.
    """
    def gap_flags(records):
        # Boolean matrix (sequence x column), True where the symbol is '-'.
        # The width is taken from the first record.
        flags = np.full((len(records), len(records[0][1])), False)
        for row, (_, symbols) in enumerate(records):
            for col, symbol in enumerate(symbols):
                if symbol == '-':
                    flags[row, col] = True
        return flags

    # 0 Load MSA
    try:
        records = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        records = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # 1 Calculate shared variables
    gaps_array = gap_flags(records)
    scores = gaps_array.sum(axis=0)  # Number of gaps in each column
    msa1 = skbio.TabularMSA([
        skbio.Protein(seq, metadata={'description': header})
        for header, seq in records
    ])

    # 2 Get trims (segments and columns)
    syms_list1 = trim_conserved(
        msa1, scores, gaps_array,
        tp['con_frac'], tp['con_window'], tp['con_minlen'], tp['con_rate'], tp['con_minsig'],
    )
    # The second return value of trim_insertions is not used here
    syms_list2, trims = trim_insertions(
        msa1, scores, gaps_array,
        tp['gap_num'], tp['gap_rate'], tp['gap_minsig'],
        tp['nongap_frac'], tp['nongap_minlen'],
        tp['gp_sigma'], tp['gd_window'], tp['indel1_rate'], tp['indel2_rate'],
        tp['weights'], tp['threshold'],
        matrix,
    )

    # 3 Combine trims (segments and columns) to yield final alignment
    msa2 = []
    for seq, syms1, syms2 in zip(msa1, syms_list1, syms_list2):
        merged = []
        for sym1, sym2 in zip(syms1, syms2):
            # Will only differ if one is converted to gap
            merged.append('-' if sym1 != sym2 else sym1)
        msa2.append((seq.metadata['description'], merged))

    # 4 Restore gap only columns
    gap_counts = gap_flags(msa2).sum(axis=0)

    # Metadata for marking consensus columns in profile HMM
    rf = ['x'] * len(msa2[0][1])
    labels = ndimage.label(gap_counts == len(msa2))[0]
    for region, in ndimage.find_objects(labels):
        rf[region] = ['.'] * (region.stop - region.start)
        for i, (_, syms) in enumerate(msa2):
            # syms is the list stored in msa2, so this edits it in place
            syms[region] = list(str(msa1[i, region]))

    # 5 Write to file
    final_seqs = [
        skbio.Protein(''.join(syms), metadata={'description': header})
        for header, syms in msa2
    ]
    msa2 = skbio.TabularMSA(final_seqs, positional_metadata={'RF': rf})
    msa2.write(f'out/{OGid}.sto', 'stockholm')
Beispiel #6
0
    def test_create_terminal_gap_mask_all(self):
        """Terminal-gap mask is expected to be all True in both scenarios.

        Both alignments contain at least one sequence made only of gaps; the
        first is checked against ``self.mask2`` and the second against
        ``self.mask5``.
        """
        def build(residues):
            # Sequences are named s1..s3 in the order given.
            return skbio.TabularMSA(
                [skbio.DNA(chars, metadata={'id': seq_id})
                 for seq_id, chars in zip(('s1', 's2', 's3'), residues)],
                minter='id')

        observed = _create_terminal_gap_mask(
            build(['----', 'AGAT', '----']), self.mask2)
        npt.assert_array_equal(observed, [True, True, True, True])

        observed = _create_terminal_gap_mask(
            build(['ACG-', 'AG-T', '----']), self.mask5)
        npt.assert_array_equal(observed, [True, True, True, True])
Beispiel #7
0
    def _prepare_sequence_data(self):
        """Return the test inputs and the expected alignment.

        Returns a tuple ``(alignment, sequences, exp)``: the aligned-FASTA
        format object, the unaligned-FASTA format object, and the expected
        four-sequence ``skbio.TabularMSA``.
        """
        sequences_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
        sequences = DNAFASTAFormat(sequences_fp, mode='r')
        alignment_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
        alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')

        expected_records = [
            ('aln-seq-1', 'AGGGGG-'),
            ('aln-seq-2', 'AGGGGGG'),
            ('seq1', 'AGGGGGG'),
            ('seq2', '-GGGGGG'),
        ]
        exp = skbio.TabularMSA([
            skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
            for seq_id, residues in expected_records
        ])

        return alignment, sequences, exp
Beispiel #8
0
    def test_mask2_wo_terminal_gap_mask(self):
        """Mask ``self.msa1`` with ``self.mask2`` at two levels.

        ``mask`` is called with ``False`` as its fourth argument, first at
        level "mask" and then at level "caution"; the "caution" result is
        expected to be one column shorter.
        """
        expectations = [
            ("mask", ('ACG', 'AG-', '-C-')),
            ("caution", ('AG', 'A-', '--')),
        ]

        for level, residues in expectations:
            obs = mask(self.msa1, self.mask2, level, False)
            exp = skbio.TabularMSA(
                [skbio.DNA(chars, metadata={'id': seq_id})
                 for seq_id, chars in zip(('s1', 's2', 's3'), residues)],
                minter='id')
            self.assertEqual(obs, exp)
Beispiel #9
0
 def test_apply_mask_mask_all(self):
     """An all-True mask is expected to leave three empty sequences."""
     every_position = np.array([True, True, True, True])
     obs = _apply_mask(self.msa1, every_position)

     exp = skbio.TabularMSA(
         [skbio.DNA('', metadata={'id': seq_id})
          for seq_id in ('s1', 's2', 's3')],
         minter='id')
     self.assertEqual(obs, exp)
Beispiel #10
0
    def _prepare_sequence_data(self):
        """Return ``(input_sequences, exp)`` for the alignment tests.

        ``input_sequences`` wraps the unaligned FASTA test file and ``exp`` is
        the expected two-sequence ``skbio.TabularMSA``.
        """
        input_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
        input_sequences = DNAFASTAFormat(input_fp, mode='r')

        expected_records = [('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG')]
        exp = skbio.TabularMSA([
            skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
            for seq_id, residues in expected_records
        ])

        return input_sequences, exp
Beispiel #11
0
 def test_reconstruct_fragment_rep_seqs(self):
     """Run ``sidle.reconstruct_fragment_rep_seqs`` on a five-sequence fixture.

     Builds the reconstruction map, the reconstruction summary and the
     aligned reference sequences as artifacts, loads the two regional
     k-mer maps from ``self.base_dir``, and compares the returned
     representative fragments (viewed as a string Series) with the two
     expected fragments.
     """
     # Reconstruction map: database sequence -> cleaned (merged) name
     map_frame = pd.DataFrame(
         data=[['seq01|seq02'],
               ['seq01|seq02'],
               ['seq03|seq04'],
               ['seq03|seq04'],
               ['seq05']],
         index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                        name='db-seq'),
         columns=['clean_name'],
     )
     recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                      map_frame)

     # Reconstruction summary, one row per reconstructed feature
     summary_frame = pd.DataFrame(
         data=[[1, 2, 2, 0, 'asv01|asv02'],
               [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
               [2, 2, 1, 0, 'asv07|asv08']],
         index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                        name='feature-id'),
         columns=['num-regions', 'total-kmers-mapped',
                  'mean-kmer-per-region', 'stdv-kmer-per-region',
                  'mapped-asvs'],
     )
     recon_summary = Artifact.import_data(
         'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

     # Aligned reference sequences
     aligned_records = [
         ('seq01',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq02',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq03',
          'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGC-'),
         ('seq04',
          '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGCC'),
         ('seq05',
          'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
          'GCCACTGACGTGCG'),
     ]
     aligned_seqs = Artifact.import_data(
         'FeatureData[AlignedSequence]',
         skbio.TabularMSA([DNA(residues, metadata={'id': seq_id})
                           for seq_id, residues in aligned_records]),
     )

     expected = pd.Series(
         data=['GCGAAGCGGCTCAGG',
               'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'],
         index=pd.Index(['seq01|seq02', 'seq03|seq04']),
     )

     kmer_maps = [
         Artifact.load(os.path.join(self.base_dir, filename))
         for filename in ('frag_r1_db_map.qza', 'frag_r2_db_map.qza')
     ]
     results = sidle.reconstruct_fragment_rep_seqs(
         region=['Bludhaven', 'Gotham'],
         kmer_map=kmer_maps,
         reconstruction_map=recon_map,
         reconstruction_summary=recon_summary,
         aligned_sequences=aligned_seqs,
     )
     observed = results.representative_fragments

     pdt.assert_series_equal(expected, observed.view(pd.Series).astype(str))
Beispiel #12
0
    def test_error_on_empty_alignment_conservation_boundary(self):
        """``mask`` must raise when the conservation filter keeps nothing.

        The single column holds three different bases, and
        ``min_conservation=0.5`` is expected to reject it; the error message
        must match the "0.00% of positions were retained" pattern.
        """
        alignment1 = skbio.TabularMSA([
            skbio.DNA(base, metadata={'id': seq_id, 'description': ''})
            for seq_id, base in (('seq1', 'A'), ('seq2', 'C'), ('seq3', 'G'))
        ])

        expected_message = " 0.00% of positions were retained by the con"
        with self.assertRaisesRegex(ValueError, expected_message):
            mask(alignment1, max_gap_frequency=1.0, min_conservation=0.5)
Beispiel #13
0
    def test_create_terminal_gap_mask_two_chrome(self):
        """Check the terminal-gap mask for three alignments with ``self.mask2``.

        ``self.msa1`` is expected to give an all-False mask; the two
        alignments built here are expected to flag the first column, and the
        first and last columns, respectively.
        """
        def build(residues):
            # Sequences are named s1..s3 in the order given.
            return skbio.TabularMSA(
                [skbio.DNA(chars, metadata={'id': seq_id})
                 for seq_id, chars in zip(('s1', 's2', 's3'), residues)],
                minter='id')

        observed = _create_terminal_gap_mask(self.msa1, self.mask2)
        npt.assert_array_equal(observed, [False, False, False, False])

        observed = _create_terminal_gap_mask(
            build(['-CGT', 'AG-T', '-C-T']), self.mask2)
        npt.assert_array_equal(observed, [True, False, False, False])

        observed = _create_terminal_gap_mask(
            build(['-CG-', 'AG-T', '-C--']), self.mask2)
        npt.assert_array_equal(observed, [True, False, False, True])
Beispiel #14
0
    def test_create_position_map_all_gaps(self):
        """The position map for an all-gap sequence is expected to be empty."""
        residues_by_id = [('s1', 'ACGT'), ('s2', 'AG-T'), ('s3', '----')]
        msa = skbio.TabularMSA(
            [skbio.DNA(residues, metadata={'id': seq_id})
             for seq_id, residues in residues_by_id],
            minter='id')

        position_map = _create_position_map(msa, 's3')

        npt.assert_array_equal(position_map, np.array([]))
Beispiel #15
0
 def test_invalid_conservation_threshold(self):
     """``min_conservation`` values just outside [0, 1] must raise.

     The thresholds tried are one machine epsilon below 0.0 and one
     machine epsilon above 1.0.
     """
     alignment = skbio.TabularMSA([
         skbio.DNA('-', metadata={'id': seq_id, 'description': ''})
         for seq_id in ('seq1', 'seq2', 'seq3')
     ])
     eps = np.finfo(float).eps

     self.assertRaises(ValueError, mask, alignment,
                       min_conservation=0.0 - eps)
     self.assertRaises(ValueError, mask, alignment,
                       min_conservation=1.0 + eps)
Beispiel #16
0
    def setUp(self):
        """Load the VCF mask fixtures and build the two shared alignments.

        Sets ``self.mask1`` .. ``self.mask7`` from ``mask1.vcf`` ..
        ``mask7.vcf`` (second element of what ``transform_format`` returns),
        plus ``self.msa1`` (3 sequences, 4 columns) and ``self.msa2``
        (6 sequences, 25 columns).
        """
        super().setUp()

        # mask1.vcf .. mask7.vcf, loaded in numeric order
        for number in range(1, 8):
            _, vcf_mask = self.transform_format(
                VCFMaskFormat, pd.DataFrame,
                filename='mask%d.vcf' % number)
            setattr(self, 'mask%d' % number, vcf_mask)

        small_records = [
            ('s1', 'ACGT'),
            ('s2', 'AG-T'),
            ('s3', '-C-T'),
        ]
        self.msa1 = skbio.TabularMSA(
            [skbio.DNA(residues, metadata={'id': seq_id})
             for seq_id, residues in small_records],
            minter='id')

        large_records = [
            ('s1', 'TCNTGNNNGGTGCCA-CC--AAA--'),
            ('s2', 'TCNTGCTCGGTGCCA-CC--AAAT-'),
            ('s3', 'TCNTGCTCGGTACCA-CC--AAA--'),
            ('S_4', '-CN-GCTCGGTGCCA-CCGGAAACT'),
            ('seq5.555', 'TCNTGCTCGGTGCCA-CC--AAATT'),
            ('s11', '--NTGCTCGGTGCCA-CC--AAAT-'),
        ]
        self.msa2 = skbio.TabularMSA(
            [skbio.DNA(residues, metadata={'id': seq_id})
             for seq_id, residues in large_records],
            minter='id')
Beispiel #17
0
    def test_mask4_w_terminal_gap_mask(self):
        """Mask ``self.msa2`` with ``self.mask4`` at two levels.

        ``mask`` is called with ``True`` as its fourth argument, first at
        level "mask" and then at level "caution"; the "caution" result is
        expected to be two columns shorter.
        """
        seq_ids = ('s1', 's2', 's3', 'S_4', 'seq5.555', 's11')
        expectations = [
            ("mask", ('NNNGGTGCCA-CC--A',
                      'CTCGGTGCCA-CC--A',
                      'CTCGGTACCA-CC--A',
                      'CTCGGTGCCA-CCGGA',
                      'CTCGGTGCCA-CC--A',
                      'CTCGGTGCCA-CC--A')),
            ("caution", ('NNNGGCCA-CC--A',
                         'CTCGGCCA-CC--A',
                         'CTCGGCCA-CC--A',
                         'CTCGGCCA-CCGGA',
                         'CTCGGCCA-CC--A',
                         'CTCGGCCA-CC--A')),
        ]

        for level, residues in expectations:
            obs = mask(self.msa2, self.mask4, level, True)
            exp = skbio.TabularMSA(
                [skbio.DNA(chars, metadata={'id': seq_id})
                 for seq_id, chars in zip(seq_ids, residues)],
                minter='id')
            self.assertEqual(obs, exp)
Beispiel #18
0
 def test_mafft(self):
     """Align the unaligned FASTA fixture with ``mafft`` and check the MSA.

     stderr is redirected to ``os.devnull`` while ``mafft`` runs; the
     result is read back as a ``skbio.TabularMSA`` of DNA sequences.
     """
     input_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
     input_sequences = DNAFASTAFormat(input_fp, mode='r')

     expected_records = [('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG')]
     exp = skbio.TabularMSA([
         skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
         for seq_id, residues in expected_records
     ])

     with redirected_stdio(stderr=os.devnull):
         result = mafft(input_sequences)

     obs = skbio.io.read(
         str(result), into=skbio.TabularMSA, constructor=skbio.DNA)
     self.assertEqual(obs, exp)
Beispiel #19
0
# Collect one record per segment returned by get_segments() for every
# alignment listed in OGids.
rows = []
for OGid in OGids:
    # Load the alignment; fall back to the second directory when the first has no file for this OGid
    try:
        msa = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # Boolean matrix (sequence x column), True where the symbol is '-';
    # the width is taken from the first sequence
    gaps_array = np.full((len(msa), len(msa[0][1])), False)
    for i, (_, seq) in enumerate(msa):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)  # Number of gaps in each column
    # Rebind msa to a TabularMSA, keeping each FASTA header as the 'description'
    msa = skbio.TabularMSA([
        skbio.Protein(seq, metadata={'description': header})
        for header, seq in msa
    ])

    # Label runs of adjacent columns that have at most tp['gap_num'] non-gap symbols
    mask = ndimage.label(len(msa) - scores <= tp['gap_num'])[0]
    # find_objects yields 1-tuples of slices for this 1-D labelling; unpack to plain slices
    regions = [region for region, in ndimage.find_objects(mask)]
    for region in regions:
        for segment in get_segments(msa, region, matrix):
            # 'length' is the total span of all slices belonging to the segment
            row = {
                'OGid': OGid,
                'start': segment['region'].start,
                'stop': segment['region'].stop,
                'index': segment['index'],
                'length': sum([s.stop - s.start for s in segment['slices']])
            }
            rows.append(row)
 def test_reconstruct_fragment_rep_seqs(self):
     """Run ``sidle.reconstruct_fragment_rep_seqs`` on a five-sequence fixture.

     The reconstruction map in this variant carries the region and primer
     columns alongside ``clean_name``. The returned representative
     fragments (viewed as a string Series) are compared with the two
     expected fragments.
     """
     # Reconstruction map, one row per database sequence
     map_rows = np.array([
         ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
         ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
         ['seq03|seq04', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
         ['seq03|seq04', 0, 'CACCTCGTN', 1, 'CACCTCGTN', 15],
         ['seq05', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
     ], dtype=object)
     map_frame = pd.DataFrame(
         data=map_rows,
         index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                        name='db-seq'),
         columns=['clean_name', 'first-region', 'first-fwd-primer',
                  'last-region', 'last-fwd-primer', 'last-kmer-length'],
     )
     recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                      map_frame)

     # Reconstruction summary, one row per reconstructed feature
     summary_frame = pd.DataFrame(
         data=[[1, 2, 2, 0, 'asv01|asv02'],
               [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
               [2, 2, 1, 0, 'asv07|asv08']],
         index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                        name='feature-id'),
         columns=['num-regions', 'total-kmers-mapped',
                  'mean-kmer-per-region', 'stdv-kmer-per-region',
                  'mapped-asvs'],
     )
     recon_summary = Artifact.import_data(
         'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

     # Aligned reference sequences
     aligned_records = [
         ('seq01',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq02',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq03',
          'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGC-'),
         ('seq04',
          '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGCC'),
         ('seq05',
          'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
          'GCCACTGACGTGCG'),
     ]
     aligned_seqs = Artifact.import_data(
         'FeatureData[AlignedSequence]',
         skbio.TabularMSA([DNA(residues, metadata={'id': seq_id})
                           for seq_id, residues in aligned_records]),
     )

     expected = pd.Series(
         data=['GCGAAGCGGCTCAGG',
               'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'],
         index=pd.Index(['seq01|seq02', 'seq03|seq04']),
     )

     results = sidle.reconstruct_fragment_rep_seqs(
         reconstruction_map=recon_map,
         reconstruction_summary=recon_summary,
         aligned_sequences=aligned_seqs,
     )
     observed = results.representative_fragments

     pdt.assert_series_equal(expected, observed.view(pd.Series).astype(str))
Beispiel #21
0
def align2skbio(align):
    """Build a ``skbio.TabularMSA`` from a mapping of key -> sequence.

    Each value of ``align`` becomes a ``Sequence`` whose ``'id'`` metadata is
    the string form of its key; sequences keep the mapping's iteration order.
    """
    records = []
    for key, residues in align.items():
        records.append(Sequence(residues, metadata={'id': str(key)}))
    return skbio.TabularMSA(records)