def test_basic(self):
    # Three-column input whose first column is mostly gaps; the expected
    # result keeps only the last two columns of every sequence.
    def make_msa(rows):
        return skbio.TabularMSA([
            skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
            for seq_id, residues in rows
        ])

    alignment = make_msa([('seq1', 'AGA'), ('seq2', '-GA'), ('seq3', '-GC')])
    actual = mask(alignment, max_gap_frequency=0.05, min_conservation=0.30)

    expected = make_msa([('seq1', 'GA'), ('seq2', 'GA'), ('seq3', 'GC')])
    self.assertEqual(actual, expected)
def test_apply_mask_mask_some(self):
    # Each case pairs a boolean column mask with the residues expected for
    # s1, s2 and s3 after the mask is applied to self.msa1.
    seq_ids = ['s1', 's2', 's3']
    cases = [
        ([False, True, True, True], ['A', 'A', '-']),
        ([False, True, True, False], ['AT', 'AT', '-T']),
        ([False, True, False, False], ['AGT', 'A-T', '--T']),
    ]
    for column_mask, residues in cases:
        obs = _apply_mask(self.msa1, np.array(column_mask))
        seqs = [
            skbio.DNA(chars, metadata=dict(id=seq_id))
            for seq_id, chars in zip(seq_ids, residues)
        ]
        exp = skbio.TabularMSA(seqs, minter='id')
        self.assertEqual(obs, exp)
def test_conservation_boundaries(self):
    # Single-column alignments checked at the extreme threshold values:
    # an all-'A' column with min_conservation=1.0 and an all-gap column
    # with min_conservation=0.0 must both come back unchanged.
    def single_column(symbol):
        return skbio.TabularMSA([
            skbio.DNA(symbol, metadata={'id': seq_id, 'description': ''})
            for seq_id in ('seq1', 'seq2', 'seq3')
        ])

    alignment1 = single_column('A')
    alignment2 = single_column('-')

    actual = mask(alignment1, max_gap_frequency=1.0, min_conservation=1.0)
    self.assertEqual(actual, alignment1)

    actual = mask(alignment2, max_gap_frequency=1.0, min_conservation=0.0)
    self.assertEqual(actual, alignment2)
def test_empty_input(self):
    # mask() must raise ValueError both for an alignment whose sequences
    # have zero length and for an alignment with no sequences at all.
    zero_length = [
        skbio.DNA('', metadata={'id': seq_id, 'description': ''})
        for seq_id in ('seq1', 'seq2', 'seq3')
    ]
    alignment = skbio.TabularMSA(zero_length)
    with self.assertRaises(ValueError):
        mask(alignment)

    alignment = skbio.TabularMSA([])
    with self.assertRaises(ValueError):
        mask(alignment)
def trim(OGid):
    """Trim the alignment for OGid and write it out in Stockholm format.

    The alignment is read from ../align_fastas1/out/, falling back to
    ../align_fastas2-2/out/ when that file does not exist. Symbol lists
    returned by trim_conserved and trim_insertions are merged so that any
    position where they disagree becomes a gap. Columns that end up
    containing only gaps are then restored from the input alignment, and
    the result is written to out/<OGid>.sto with an 'RF' positional
    annotation ('x' by default, '.' for restored columns).

    Relies on the module-level names read_fasta, tp, matrix,
    trim_conserved and trim_insertions.
    """
    def gap_flags(records):
        # Boolean sequences-by-columns array that is True wherever a symbol
        # is '-'. The width is taken from the first record.
        flags = np.full((len(records), len(records[0][1])), False)
        for row, (_, symbols) in enumerate(records):
            for col, symbol in enumerate(symbols):
                if symbol == '-':
                    flags[row, col] = True
        return flags

    # Load the (header, sequence) records.
    try:
        records = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        records = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # Values shared by both trimming passes: the gap array and the number
    # of gaps in each column.
    gaps_array = gap_flags(records)
    scores = gaps_array.sum(axis=0)
    msa1 = skbio.TabularMSA([
        skbio.Protein(seq, metadata={'description': header})
        for header, seq in records
    ])

    # Run both trimming passes (segments and columns).
    conserved_keys = ('con_frac', 'con_window', 'con_minlen', 'con_rate',
                      'con_minsig')
    insertion_keys = ('gap_num', 'gap_rate', 'gap_minsig', 'nongap_frac',
                      'nongap_minlen', 'gp_sigma', 'gd_window',
                      'indel1_rate', 'indel2_rate', 'weights', 'threshold')
    syms_list1 = trim_conserved(msa1, scores, gaps_array,
                                *(tp[key] for key in conserved_keys))
    syms_list2, _ = trim_insertions(msa1, scores, gaps_array,
                                    *(tp[key] for key in insertion_keys),
                                    matrix)

    # Merge the two results. The symbols only differ where one pass turned
    # a symbol into a gap, so any disagreement becomes a gap.
    msa2 = [
        (record.metadata['description'],
         [first if first == second else '-'
          for first, second in zip(kept1, kept2)])
        for record, kept1, kept2 in zip(msa1, syms_list1, syms_list2)
    ]

    # Restore columns that are now gap-only from the input alignment.
    scores = gap_flags(msa2).sum(axis=0)
    # RF annotation: metadata for marking consensus columns in profile HMM.
    rf = ['x'] * len(msa2[0][1])
    labeled, _ = ndimage.label(scores == len(msa2))
    for (region,) in ndimage.find_objects(labeled):
        rf[region] = ['.'] * (region.stop - region.start)
        for i, (_, syms) in enumerate(msa2):
            syms[region] = list(str(msa1[i, region]))

    # Write the final alignment.
    msa2 = skbio.TabularMSA(
        [skbio.Protein(''.join(syms), metadata={'description': header})
         for header, syms in msa2],
        positional_metadata={'RF': rf})
    msa2.write(f'out/{OGid}.sto', 'stockholm')
def test_create_terminal_gap_mask_all(self):
    # Both alignments are expected to yield a terminal gap mask that is
    # True at all four positions.
    seq_ids = ['s1', 's2', 's3']
    all_true = [True, True, True, True]
    cases = [
        (['----', 'AGAT', '----'], self.mask2),
        (['ACG-', 'AG-T', '----'], self.mask5),
    ]
    for residues, vcf_mask in cases:
        seqs = [
            skbio.DNA(chars, metadata=dict(id=seq_id))
            for seq_id, chars in zip(seq_ids, residues)
        ]
        msa = skbio.TabularMSA(seqs, minter='id')
        obs = _create_terminal_gap_mask(msa, vcf_mask)
        npt.assert_array_equal(obs, all_true)
def _prepare_sequence_data(self):
    """Return (alignment, sequences, expected) fixtures for the tests.

    The alignment and the unaligned sequences are opened read-only from
    the test data directory; the expected value is a four-sequence
    TabularMSA built in place.
    """
    sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')
    alignment = AlignedDNAFASTAFormat(
        self.get_data_path('aligned-dna-sequences-1.fasta'), mode='r')

    expected_rows = [
        ('aln-seq-1', 'AGGGGG-'),
        ('aln-seq-2', 'AGGGGGG'),
        ('seq1', 'AGGGGGG'),
        ('seq2', '-GGGGGG'),
    ]
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])
    return alignment, sequences, exp
def test_mask2_wo_terminal_gap_mask(self):
    # Apply self.mask2 to self.msa1 at each level with the fourth argument
    # False, and compare against the residues expected for s1, s2, s3.
    seq_ids = ['s1', 's2', 's3']
    cases = [
        ("mask", ['ACG', 'AG-', '-C-']),
        ("caution", ['AG', 'A-', '--']),
    ]
    for level, residues in cases:
        obs = mask(self.msa1, self.mask2, level, False)
        seqs = [
            skbio.DNA(chars, metadata=dict(id=seq_id))
            for seq_id, chars in zip(seq_ids, residues)
        ]
        exp = skbio.TabularMSA(seqs, minter='id')
        self.assertEqual(obs, exp)
def test_apply_mask_mask_all(self):
    # An all-True mask is expected to leave three zero-length sequences.
    everything = np.array([True] * 4)
    obs = _apply_mask(self.msa1, everything)

    seqs = [
        skbio.DNA('', metadata=dict(id=seq_id))
        for seq_id in ('s1', 's2', 's3')
    ]
    exp = skbio.TabularMSA(seqs, minter='id')
    self.assertEqual(obs, exp)
def _prepare_sequence_data(self):
    """Return (input_sequences, expected) fixtures for the tests.

    The unaligned sequences are opened read-only from the test data
    directory; the expected value is a two-sequence TabularMSA.
    """
    input_sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')

    expected_rows = [('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG')]
    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in expected_rows
    ])
    return input_sequences, exp
def test_reconstruct_fragment_rep_seqs(self):
    # Database sequence -> reconstructed feature name.
    recon_frame = pd.DataFrame(
        data=[['seq01|seq02'],
              ['seq01|seq02'],
              ['seq03|seq04'],
              ['seq03|seq04'],
              ['seq05']],
        index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                       name='db-seq'),
        columns=['clean_name'],
    )
    recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                     recon_frame)

    # Per-feature reconstruction summary.
    summary_frame = pd.DataFrame(
        data=[[1, 2, 2, 0, 'asv01|asv02'],
              [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
              [2, 2, 1, 0, 'asv07|asv08']],
        index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                       name='feature-id'),
        columns=['num-regions', 'total-kmers-mapped',
                 'mean-kmer-per-region', 'stdv-kmer-per-region',
                 'mapped-asvs'],
    )
    recon_summary = Artifact.import_data(
        'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

    # Aligned reference sequences, in the same order as the map's index.
    aligned_rows = [
        ('seq01',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq02',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq03',
         'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGC-'),
        ('seq04',
         '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGCC'),
        ('seq05',
         'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
         'GCCACTGACGTGCG'),
    ]
    aligned_seqs = Artifact.import_data(
        'FeatureData[AlignedSequence]',
        skbio.TabularMSA([
            DNA(residues, metadata={'id': seq_id})
            for seq_id, residues in aligned_rows
        ]),
    )

    known = pd.Series(
        data=['GCGAAGCGGCTCAGG',
              'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'],
        index=pd.Index(['seq01|seq02', 'seq03|seq04']),
    )

    kmer_maps = [
        Artifact.load(os.path.join(self.base_dir, filename))
        for filename in ('frag_r1_db_map.qza', 'frag_r2_db_map.qza')
    ]
    results = sidle.reconstruct_fragment_rep_seqs(
        region=['Bludhaven', 'Gotham'],
        kmer_map=kmer_maps,
        reconstruction_map=recon_map,
        reconstruction_summary=recon_summary,
        aligned_sequences=aligned_seqs,
    )
    test = results.representative_fragments

    pdt.assert_series_equal(known, test.view(pd.Series).astype(str))
def test_error_on_empty_alignment_conservation_boundary(self):
    # A single column holding three different bases, masked with
    # min_conservation=0.5, must raise a ValueError whose message reports
    # that 0.00% of positions were retained.
    alignment1 = skbio.TabularMSA([
        skbio.DNA(base, metadata={'id': seq_id, 'description': ''})
        for seq_id, base in (('seq1', 'A'), ('seq2', 'C'), ('seq3', 'G'))
    ])

    with self.assertRaisesRegex(
            ValueError, " 0.00% of positions were retained by the con"):
        mask(alignment1, max_gap_frequency=1.0, min_conservation=0.5)
def test_create_terminal_gap_mask_two_chrome(self):
    # The shared fixture alignment is expected to give an all-False mask.
    obs = _create_terminal_gap_mask(self.msa1, self.mask2)
    npt.assert_array_equal(obs, [False, False, False, False])

    # Alignments built here, each paired with its expected terminal gap
    # mask when checked against self.mask2.
    seq_ids = ['s1', 's2', 's3']
    cases = [
        (['-CGT', 'AG-T', '-C-T'], [True, False, False, False]),
        (['-CG-', 'AG-T', '-C--'], [True, False, False, True]),
    ]
    for residues, expected in cases:
        seqs = [
            skbio.DNA(chars, metadata=dict(id=seq_id))
            for seq_id, chars in zip(seq_ids, residues)
        ]
        msa = skbio.TabularMSA(seqs, minter='id')
        obs = _create_terminal_gap_mask(msa, self.mask2)
        npt.assert_array_equal(obs, expected)
def test_create_position_map_all_gaps(self):
    # The position map is requested for 's3', which consists only of gaps;
    # the expected result is an empty array.
    rows = [('s1', 'ACGT'), ('s2', 'AG-T'), ('s3', '----')]
    msa = skbio.TabularMSA(
        [skbio.DNA(chars, metadata=dict(id=seq_id))
         for seq_id, chars in rows],
        minter='id')

    obs = _create_position_map(msa, 's3')
    npt.assert_array_equal(obs, np.array([]))
def test_invalid_conservation_threshold(self):
    # min_conservation values one machine epsilon below 0.0 or above 1.0
    # must be rejected with ValueError.
    alignment = skbio.TabularMSA([
        skbio.DNA('-', metadata={'id': seq_id, 'description': ''})
        for seq_id in ('seq1', 'seq2', 'seq3')
    ])

    eps = np.finfo(float).eps
    for out_of_range in (0.0 - eps, 1.0 + eps):
        with self.assertRaises(ValueError):
            mask(alignment, min_conservation=out_of_range)
def setUp(self):
    super().setUp()

    # Load mask1.vcf .. mask7.vcf as DataFrames onto self.mask1 ..
    # self.mask7; the first element returned by transform_format is
    # discarded.
    for number in range(1, 8):
        _, frame = self.transform_format(
            VCFMaskFormat, pd.DataFrame,
            filename='mask{}.vcf'.format(number))
        setattr(self, 'mask{}'.format(number), frame)

    def build_msa(rows):
        return skbio.TabularMSA(
            [skbio.DNA(residues, metadata=dict(id=seq_id))
             for seq_id, residues in rows],
            minter='id')

    # Small four-column alignment.
    self.msa1 = build_msa([
        ('s1', 'ACGT'),
        ('s2', 'AG-T'),
        ('s3', '-C-T'),
    ])

    # Larger alignment with ambiguous bases, internal and terminal gaps.
    self.msa2 = build_msa([
        ('s1', 'TCNTGNNNGGTGCCA-CC--AAA--'),
        ('s2', 'TCNTGCTCGGTGCCA-CC--AAAT-'),
        ('s3', 'TCNTGCTCGGTACCA-CC--AAA--'),
        ('S_4', '-CN-GCTCGGTGCCA-CCGGAAACT'),
        ('seq5.555', 'TCNTGCTCGGTGCCA-CC--AAATT'),
        ('s11', '--NTGCTCGGTGCCA-CC--AAAT-'),
    ])
def test_mask4_w_terminal_gap_mask(self):
    # Apply self.mask4 to self.msa2 at each level with the fourth argument
    # True, and compare against the residues expected for every sequence.
    seq_ids = ['s1', 's2', 's3', 'S_4', 'seq5.555', 's11']
    cases = [
        ("mask", [
            'NNNGGTGCCA-CC--A',
            'CTCGGTGCCA-CC--A',
            'CTCGGTACCA-CC--A',
            'CTCGGTGCCA-CCGGA',
            'CTCGGTGCCA-CC--A',
            'CTCGGTGCCA-CC--A',
        ]),
        ("caution", [
            'NNNGGCCA-CC--A',
            'CTCGGCCA-CC--A',
            'CTCGGCCA-CC--A',
            'CTCGGCCA-CCGGA',
            'CTCGGCCA-CC--A',
            'CTCGGCCA-CC--A',
        ]),
    ]
    for level, residues in cases:
        obs = mask(self.msa2, self.mask4, level, True)
        seqs = [
            skbio.DNA(chars, metadata=dict(id=seq_id))
            for seq_id, chars in zip(seq_ids, residues)
        ]
        exp = skbio.TabularMSA(seqs, minter='id')
        self.assertEqual(obs, exp)
def test_mafft(self):
    # Align the two-sequence fixture and compare the alignment read back
    # from the result with the expected TabularMSA.
    input_sequences = DNAFASTAFormat(
        self.get_data_path('unaligned-dna-sequences-1.fasta'), mode='r')

    exp = skbio.TabularMSA([
        skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
        for seq_id, residues in (('seq1', 'AGGGGGG'), ('seq2', '-GGGGGG'))
    ])

    # stderr is redirected to os.devnull for the duration of the call.
    with redirected_stdio(stderr=os.devnull):
        result = mafft(input_sequences)

    obs = skbio.io.read(str(result), into=skbio.TabularMSA,
                        constructor=skbio.DNA)
    self.assertEqual(obs, exp)
# Build one record per segment found in the selected regions of each
# alignment named in OGids.
rows = []
for OGid in OGids:
    # Read from the primary directory, falling back to the secondary one
    # when the file does not exist there.
    try:
        msa = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # Boolean sequences-by-columns array marking gap symbols; the width
    # comes from the first record.
    gaps_array = np.zeros((len(msa), len(msa[0][1])), dtype=bool)
    for i, (_, seq) in enumerate(msa):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)  # Number of gaps in each column
    msa = skbio.TabularMSA([
        skbio.Protein(seq, metadata={'description': header})
        for header, seq in msa
    ])

    # Label contiguous runs of columns whose non-gap count is at most
    # tp['gap_num'].
    mask = ndimage.label((len(msa) - scores) <= tp['gap_num'])[0]
    regions = [region for (region,) in ndimage.find_objects(mask)]
    for region in regions:
        for segment in get_segments(msa, region, matrix):
            row = {
                'OGid': OGid,
                'start': segment['region'].start,
                'stop': segment['region'].stop,
                'index': segment['index'],
                'length': sum(s.stop - s.start for s in segment['slices']),
            }
            rows.append(row)
def test_reconstruct_fragment_rep_seqs(self):
    # Database sequence -> reconstructed feature name plus the region,
    # primer and kmer-length columns.
    recon_rows = np.array([
        ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
        ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
        ['seq03|seq04', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
        ['seq03|seq04', 0, 'CACCTCGTN', 1, 'CACCTCGTN', 15],
        ['seq05', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
    ], dtype=object)
    recon_frame = pd.DataFrame(
        data=recon_rows,
        index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                       name='db-seq'),
        columns=['clean_name', 'first-region', 'first-fwd-primer',
                 'last-region', 'last-fwd-primer', 'last-kmer-length'],
    )
    recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                     recon_frame)

    # Per-feature reconstruction summary.
    summary_frame = pd.DataFrame(
        data=[[1, 2, 2, 0, 'asv01|asv02'],
              [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
              [2, 2, 1, 0, 'asv07|asv08']],
        index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                       name='feature-id'),
        columns=['num-regions', 'total-kmers-mapped',
                 'mean-kmer-per-region', 'stdv-kmer-per-region',
                 'mapped-asvs'],
    )
    recon_summary = Artifact.import_data(
        'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

    # Aligned reference sequences, in the same order as the map's index.
    aligned_rows = [
        ('seq01',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq02',
         'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
         '--------------'),
        ('seq03',
         'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGC-'),
        ('seq04',
         '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
         'GCTTCTGACGTGCC'),
        ('seq05',
         'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
         'GCCACTGACGTGCG'),
    ]
    aligned_seqs = Artifact.import_data(
        'FeatureData[AlignedSequence]',
        skbio.TabularMSA([
            DNA(residues, metadata={'id': seq_id})
            for seq_id, residues in aligned_rows
        ]),
    )

    known = pd.Series(
        data=['GCGAAGCGGCTCAGG',
              'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'],
        index=pd.Index(['seq01|seq02', 'seq03|seq04']),
    )

    results = sidle.reconstruct_fragment_rep_seqs(
        reconstruction_map=recon_map,
        reconstruction_summary=recon_summary,
        aligned_sequences=aligned_seqs,
    )
    test = results.representative_fragments

    pdt.assert_series_equal(known, test.view(pd.Series).astype(str))
def align2skbio(align):
    """Build a TabularMSA from a mapping of identifier to sequence data.

    Each item of ``align`` becomes one ``Sequence`` whose ``id`` metadata
    is the key converted to a string. Sequences appear in the mapping's
    iteration order.
    """
    records = []
    for key, residues in align.items():
        records.append(Sequence(residues, metadata={'id': str(key)}))
    return skbio.TabularMSA(records)