def test_write_results(self):
    """Test writing HGT results to FASTA and GenBank files.
    """
    self.genes_recip['D_2_hgt_n'] = ['MKKNIILNLIGLRCPEPIMI', 321, 381, '+']
    donor_genbank_fp = join(self.proteomes_dir, "donor.fna")
    recipient_genbank_fp = join(self.proteomes_dir, "recip.fna")
    dnr_g_nucl_fp, dnr_g_aa_fp, dnr_g_gb_fp, \
        rcp_g_nucl_fp, rcp_g_aa_fp, rcp_g_gb_fp =\
        write_results(self.genes_donor,
                      donor_genbank_fp,
                      self.genes_recip,
                      recipient_genbank_fp,
                      self.seq_donor,
                      self.seq_recip,
                      self.simulated_dir)
    donor_nucl = Sequence.read(dnr_g_nucl_fp, format='fasta')
    # test for correctness of donor nucleotide genome sequence
    self.assertEqual(str(donor_nucl), str(self.seq_donor))
    recip_nucl = Sequence.read(rcp_g_nucl_fp, format='fasta')
    # test for correctness of recipient nucleotide genome sequence
    self.assertEqual(str(recip_nucl), str(self.seq_recip))
    # LOCUS fields shared by both GenBank files; the name and size entries
    # are overwritten below before each comparison
    locus = {'unit': 'bp', 'shape': 'circular', 'division': 'CON',
             'mol_type': 'DNA', 'date': '01-JAN-1900'}
    donor_gb = Sequence.read(dnr_g_gb_fp, format='genbank')
    locus['locus_name'] = 'donor'
    locus['size'] = len(str(self.seq_donor))
    # test for correctness of donor GenBank file
    self.assertEqual(str(donor_gb), str(self.seq_donor))
    self.assertDictEqual(donor_gb.metadata['LOCUS'], locus)
    recip_gb = Sequence.read(rcp_g_gb_fp, format='genbank')
    locus['locus_name'] = 'recipient'
    locus['size'] = len(str(self.seq_recip))
    # test for correctness of recipient GenBank file
    self.assertEqual(str(recip_gb), str(self.seq_recip))
    self.assertDictEqual(recip_gb.metadata['LOCUS'], locus)
    donor_aa_dict = {}
    for seq in skbio.io.read(dnr_g_aa_fp, format='fasta'):
        donor_aa_dict[seq.metadata['id']] = str(seq)
    # test for correctness of donor protein coding sequences
    # (assertEqual, not assertTrue: assertTrue would take the second
    # argument as the failure message and never compare the two counts)
    self.assertEqual(len(donor_aa_dict), len(self.genes_donor))
    for gene in donor_aa_dict:
        self.assertIn(gene, self.genes_donor)
        self.assertEqual(donor_aa_dict[gene], self.genes_donor[gene][0])
    recip_aa_dict = {}
    for seq in skbio.io.read(rcp_g_aa_fp, format='fasta'):
        recip_aa_dict[seq.metadata['id']] = str(seq)
    # test for correctness of recipient protein coding sequences
    self.assertEqual(len(recip_aa_dict), len(self.genes_recip))
    for gene in recip_aa_dict:
        self.assertIn(gene, self.genes_recip)
        self.assertEqual(recip_aa_dict[gene], self.genes_recip[gene][0])
def test_write_results(self):
    """Test writing HGT results to FASTA files.
    """
    self.genes_recip['D_2_hgt_n'] = ['MKKNIILNLIGLRCPEPIMI', 321, 381, '+']
    donor_genbank_fp = join(self.proteomes_dir, "donor.fna")
    recipient_genbank_fp = join(self.proteomes_dir, "recip.fna")
    dnr_g_nucl_fp, dnr_g_aa_fp, rcp_g_nucl_fp, rcp_g_aa_fp =\
        write_results(self.genes_donor,
                      donor_genbank_fp,
                      self.genes_recip,
                      recipient_genbank_fp,
                      self.seq_donor,
                      self.seq_recip,
                      self.simulated_dir)
    donor_nucl = Sequence.read(dnr_g_nucl_fp, format='fasta')
    # test for correctness of donor nucleotide genome sequence
    self.assertEqual(str(donor_nucl), str(self.seq_donor))
    recip_nucl = Sequence.read(rcp_g_nucl_fp, format='fasta')
    # test for correctness of recipient nucleotide genome sequence
    self.assertEqual(str(recip_nucl), str(self.seq_recip))
    donor_aa_dict = {}
    for seq in skbio.io.read(dnr_g_aa_fp, format='fasta'):
        donor_aa_dict[seq.metadata['id']] = str(seq)
    # test for correctness of donor protein coding sequences
    # (assertEqual, not assertTrue: assertTrue would take the second
    # argument as the failure message and never compare the two counts)
    self.assertEqual(len(donor_aa_dict), len(self.genes_donor))
    for gene in donor_aa_dict:
        self.assertIn(gene, self.genes_donor)
        self.assertEqual(donor_aa_dict[gene], self.genes_donor[gene][0])
    recip_aa_dict = {}
    for seq in skbio.io.read(rcp_g_aa_fp, format='fasta'):
        recip_aa_dict[seq.metadata['id']] = str(seq)
    # test for correctness of recipient protein coding sequences
    self.assertEqual(len(recip_aa_dict), len(self.genes_recip))
    for gene in recip_aa_dict:
        self.assertIn(gene, self.genes_recip)
        self.assertEqual(recip_aa_dict[gene], self.genes_recip[gene][0])
def parse_genemark(input_f, genbank_fp):
    """ Extract atypical genes identified by GeneMark

    Parameters
    ----------
    input_f: iterable of string
        open file handle (or any iterable of lines) for the GeneMark output
        gene list (*.lst)
    genbank_fp: string
        file path to genome in GenBank format

    Notes
    -----
    genbank_fp is the intermediate GenBank file generated by
    reformat_input.py, in which multiple sequences are concatenated, instead
    of the original GenBank file.

    Returns
    -------
    output: string
        gene names (protein_ids) separated by newline
    """
    # protein_id -> (start, end, strand); the first CDS seen for an ID wins
    genes = {}
    gb = Sequence.read(genbank_fp, format='genbank')
    for feature in gb.interval_metadata._intervals:
        m = feature.metadata
        if m['type'] == 'CDS' and 'protein_id' in m:
            protein_id = m['protein_id'].replace('\"', '')
            if protein_id not in genes:
                strand = m['strand']
                # in scikit-bio, this number is the start location - 1
                start = feature.bounds[0][0] + 1
                end = feature.bounds[0][1]
                genes[protein_id] = (start, end, strand)
    # reverse index so that each GeneMark line is resolved with one lookup
    # instead of a scan over every gene
    genes_by_coords = {}
    for gene, coords in genes.items():
        genes_by_coords.setdefault(coords, []).append(gene)
    atypical_genes = []
    reading = False
    for line in input_f:
        fields = line.strip().split()
        # the gene table starts after the "# Length" header line
        if fields == ['#', 'Length']:
            reading = True
        # atypical genes have class '2' in the 6th column
        elif reading and len(fields) == 6 and fields[5] == '2':
            # partial genes carry a '<' or '>' prefix on their coordinates
            coords = (int(fields[2].lstrip('<>')),
                      int(fields[3].lstrip('<>')),
                      fields[1])
            atypical_genes.extend(genes_by_coords.get(coords, ()))
    return '\n'.join(sorted(atypical_genes))
def parse_egid(input_f, genbank_fp):
    """ Extract genes contained in GIs identified by EGID

    Parameters
    ----------
    input_f: iterable of string
        open file handle (or any iterable of lines) for the EGID output
        results (GI coordinates)
    genbank_fp: string
        file path to genome in GenBank format (containing gene coordinates)

    Notes
    -----
    genbank_fp is the intermediate GenBank file generated by
    reformat_input.py, in which multiple sequences are concatenated, instead
    of the original GenBank file.

    Returns
    -------
    output: string
        gene names (protein_ids) separated by newline
    """
    record = Sequence.read(genbank_fp, format='genbank')
    # protein_id -> (start, end); the first CDS seen for an ID wins
    coords = {}
    for interval in record.interval_metadata._intervals:
        meta = interval.metadata
        if meta['type'] != 'CDS' or 'protein_id' not in meta:
            continue
        pid = meta['protein_id'].replace('\"', '')
        if pid in coords:
            continue
        # in scikit-bio, this number is the start location - 1
        first = interval.bounds[0][0] + 1
        last = interval.bounds[0][1]
        coords[pid] = (first, last)
    inside = set()
    for line in input_f:
        cols = line.strip().split()
        # a valid GI definition should have at least 2 columns
        if len(cols) < 2:
            continue
        gi_start = int(cols[0])
        gi_end = int(cols[1])
        # keep genes lying entirely within the island
        inside.update(pid for pid, (first, last) in coords.items()
                      if first >= gi_start and last <= gi_end)
    return '\n'.join(sorted(inside))
def extract_genbank(genbank_fp, verbose=False):
    """Extract protein coding sequences from GenBank record.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    verbose: boolean, optional
        if True, write progress messages to stdout

    Returns
    -------
    seq: skbio.sequence.Sequence
        Sequence object
    genes: dictionary
        a dictionary of genes (CDS) and their info, with the key being the
        protein IDs and the value being a 4-element list including the
        translated sequence, the start and end positions in the genome, and
        the strand

    Raises
    ------
    KeyError
        if the same protein ID occurs in more than one CDS
    """
    if verbose:
        sys.stdout.write('\tParse GenBank record ...\n')
    seq = Sequence.read(genbank_fp, format='genbank')
    if verbose:
        sys.stdout.write('\t\tDone.\n')
    genes = {}
    for interval in seq.interval_metadata._intervals:
        meta = interval.metadata
        if meta['type'] != 'CDS':
            continue
        raw_id = meta['protein_id']
        raw_translation = meta['translation']
        strand = meta['strand']
        # in scikit-bio, this number is the start location - 1
        start = interval.bounds[0][0] + 1
        end = interval.bounds[0][1]
        gene = raw_id.replace('\"', '')
        if gene in genes:
            raise KeyError('%s already exists in dictionary' % gene)
        protein = raw_translation.replace(' ', '').replace('\"', '')
        genes[gene] = [protein, start, end, strand]
    return seq, genes
def extract_genbank(genbank_fp, verbose=False):
    """Extract protein coding sequences from GenBank record.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    verbose: boolean, optional
        if True, write progress messages to stdout

    Returns
    -------
    seq: skbio.sequence.Sequence
        Sequence object
    genes: dictionary
        a dictionary of genes (CDS) and their info, with the key being the
        protein IDs and the value being a 4-element list including the
        translated sequence, the start and end positions in the genome, and
        the strand ('+' or '-')

    Raises
    ------
    KeyError
        if the same protein ID occurs in more than one CDS
    """
    genes = {}
    if verbose:
        sys.stdout.write("\tParse GenBank record ...\n")
    seq = Sequence.read(genbank_fp, format='genbank')
    if verbose:
        sys.stdout.write("\t\tDone.\n")
    for feature in seq.interval_metadata.features:
        if feature['type_'] == 'CDS':
            # normalise the ID *before* the duplicate check: keys are stored
            # without quotes, so testing the raw quoted ID would never detect
            # a duplicate and the earlier entry would be silently overwritten
            protein_id = feature['protein_id'].replace("\"", "")
            translation = feature['translation']
            strand = '-' if feature['rc_'] else '+'
            loc = seq.interval_metadata.features[feature]
            start_pos = loc[0][0]
            end_pos = loc[0][1]
            if protein_id in genes:
                raise KeyError("%s already exists in dictionary" %
                               protein_id)
            genes[protein_id] = [
                translation.replace(" ", "").replace("\"", ""),
                start_pos, end_pos, strand]
    return seq, genes
def test_traceback(self):
    # Exercise _traceback from several start cells of the same score and
    # traceback matrices, with one and with two sequences per alignment.
    score_m = [[0, -5, -7, -9],
               [-5, 2, -3, -5],
               [-7, -3, 4, -1],
               [-9, -5, -1, 6],
               [-11, -7, -3, 1]]
    score_m = np.array(score_m)
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 1, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)
    # start at bottom-right
    expected = ([Sequence("ACG-", metadata={'id': '0'})],
                [Sequence("ACGT", metadata={'id': '1'})], 1, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG', metadata={'id': ''})]),
                        Alignment([DNA('ACGT', metadata={'id': ''})]),
                        4, 3)
    self.assertEqual(actual, expected)

    # four sequences in two alignments
    score_m = [[0, -5, -7, -9],
               [-5, 2, -3, -5],
               [-7, -3, 4, -1],
               [-9, -5, -1, 6],
               [-11, -7, -3, 1]]
    score_m = np.array(score_m)
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 1, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)
    # start at bottom-right
    expected = ([
        Sequence("ACG-", metadata={'id': 's1'}),
        Sequence("ACG-", metadata={'id': 's2'})
    ], [
        Sequence("ACGT", metadata={'id': 's3'}),
        Sequence("ACGT", metadata={'id': 's4'})
    ], 1, 0, 0)
    actual = _traceback(
        tback_m, score_m,
        Alignment([
            DNA('ACG', metadata={'id': 's1'}),
            DNA('ACG', metadata={'id': 's2'})
        ]),
        Alignment([
            DNA('ACGT', metadata={'id': 's3'}),
            DNA('ACGT', metadata={'id': 's4'})
        ]),
        4, 3)
    self.assertEqual(actual, expected)

    # start at highest-score
    expected = ([Sequence("ACG", metadata={'id': '0'})],
                [Sequence("ACG", metadata={'id': '1'})], 6, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG', metadata={'id': ''})]),
                        Alignment([DNA('ACGT', metadata={'id': ''})]),
                        3, 3)
    self.assertEqual(actual, expected)

    # terminate traceback before top-right
    tback_m = [[0, 3, 3, 3],
               [2, 1, 3, 3],
               [2, 2, 0, 3],
               [2, 2, 2, 1],
               [2, 2, 2, 2]]
    tback_m = np.array(tback_m)
    expected = ([Sequence("G", metadata={'id': '0'})],
                [Sequence("G", metadata={'id': '1'})], 6, 2, 2)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG', metadata={'id': ''})]),
                        Alignment([DNA('ACGT', metadata={'id': ''})]),
                        3, 3)
    self.assertEqual(actual, expected)
def _merge_genbank_seqs(genbank_fp):
    """ Merge one to multiple sequences in a GenBank file into one.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format

    Returns
    -------
    tuple of (
        skbio.DNA
            concatenated genome sequence carrying mock LOCUS metadata and
            one "gene" plus one "CDS" interval per protein-coding gene,
        dict of { string : list of [ string, int, int, string ] }
            protein ID : translation, start, end, and strand
    )

    Notes
    -----
    Gene coordinates are expressed relative to the concatenated sequence.
    Only the first CDS seen for a given protein ID is kept.
    """
    # every GenBank record is terminated by a line starting with "//"
    with open(genbank_fp, 'r') as input_f:
        nseq = sum(1 for line in input_f if line.startswith('//'))
    chunks = []  # nucleotide sequence of each record, in file order
    genes = {}
    abs_pos = 0  # absolute position in concatenated nucleotide sequence
    for i in range(nseq):
        gb = Sequence.read(genbank_fp, seq_num=i+1, format='genbank')
        size = gb.metadata['LOCUS']['size']
        chunks.append(str(gb))
        for feature in gb.interval_metadata._intervals:
            m = feature.metadata
            if m['type'] == 'CDS' and 'protein_id' in m:
                protein_id = m['protein_id'].replace('\"', '')
                if protein_id not in genes:
                    translation = m['translation'].replace(' ', '') \
                        .replace('\"', '')
                    strand = m['strand']
                    # in scikit-bio, bounds[0][0] is the start location - 1
                    start = feature.bounds[0][0] + abs_pos + 1
                    end = feature.bounds[0][1] + abs_pos
                    genes[protein_id] = [translation, start, end, strand]
        abs_pos += int(size)
    # join once instead of growing a string record by record
    nucl_seq = ''.join(chunks)
    gb = DNA(nucl_seq)
    # generate mock metadata for the merged sequence
    gb.metadata['LOCUS'] = {'locus_name': 'locus001',
                            'size': len(nucl_seq),
                            'unit': 'bp',
                            'shape': 'circular',
                            'division': 'CON',
                            'mol_type': 'DNA',
                            'date': '01-JAN-1900'}
    gb.metadata['id'] = 'locus001'
    gb.interval_metadata._intervals = []
    # genes are numbered incrementally (from 1) in order of start position
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    for gid, (gene, info) in enumerate(ordered, start=1):
        translation, start, end, strand = info
        # generate "gene" and "CDS" records for each protein-coding gene
        location = str(start) + '..' + str(end)  # start and end coordinates
        if strand == '-':  # negative strand
            location = 'complement(' + location + ')'
        feature = {'type': 'gene',
                   'locus_tag': 'gene' + str(gid),
                   '__location': location}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
        feature = {'type': 'CDS',
                   'locus_tag': 'gene' + str(gid),
                   '__location': location,
                   'protein_id': gene,
                   'translation': translation}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
    return (gb, genes)
def gen():
    # Yield one sequence with per-position quality scores followed by one
    # without any positional metadata; both share the same id/description.
    with_quality = Sequence(
        'ACGT',
        metadata={'id': 'foo', 'description': 'bar'},
        positional_metadata={'quality': range(4)})
    yield with_quality
    without_quality = Sequence(
        'ACG',
        metadata={'id': 'foo', 'description': 'bar'})
    yield without_quality
def simulate_orthologous_rep(genes_donor,
                             seq_donor,
                             genes_recip,
                             seq_recip,
                             sequence_ids,
                             orthologous_groups,
                             orthologous_rep_prob,
                             percentage_hgts,
                             log_f):
    """Simulate orthologous replacement HGT.

    Parameters
    ----------
    genes_donor: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists (translated sequence, start, end, strand)
    seq_donor: skbio.sequence.Sequence
        Sequence object for donor genome
    genes_recip: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists (translated sequence, start, end, strand); modified in place
    seq_recip: skbio.sequence.Sequence
        Sequence object for recipient genome
    sequence_ids: dictionary
        Keys are in the form x_y (species_gene) and values are original
        accessions
    orthologous_groups: list of lists
        List of orthologous families between donor and recipient proteomes
    orthologous_rep_prob: float
        Probability HGT will be orthologous replacement
    percentage_hgts: float
        Percent of HGTs to simulate
    log_f: file descriptor
        Log file descriptor

    Returns
    -------
    seq_recip: skbio.sequence.Sequence
        recipient genome sequence with HGTs

    Notes
    -----
    Using list of orthologous genes between donor and recipient genomes,
    randomly choose genes to exchange from donor to recipient. This function
    itself only updates genes_recip, writes one line per HGT to log_f and
    returns the new recipient sequence.

    Algorithm:
        1. Choose randomly N orthogroups to be used for simulating HGTs
        2. For each orthogroup, choose randomly a donor and recipient gene
        3. Replace recipient gene with donor (both the entry in genes_recip
           and the nucleotide sequence).
    """
    # number of HGTs to simulate (at least one)
    num_hgts = int(percentage_hgts * orthologous_rep_prob * len(genes_recip))
    if num_hgts < 1:
        num_hgts = 1
    num_orthogroups = len(orthologous_groups)
    # NOTE(review): random.sample raises ValueError when num_hgts exceeds
    # num_orthogroups -- confirm callers guarantee enough orthogroups
    idx = random.sample(range(num_orthogroups), num_hgts)
    log_f.write("#type\tdonor\tstart\tend\trecipient\tnew label "
                "recipient\tstart\tend\tstrand\n")
    seq_recip_seq = str(seq_recip)
    for i in idx:
        orthogroup = orthologous_groups[i]
        substitute_genes = ['*', '*']
        # Randomly select two orthologous genes from the same family
        # representing the donor and recipient genomes. Each orthogroup is
        # guaranteed to have at least two genes, one from donor (prefixed with
        # '0') and second from recipient (prefixed with '1'). The following
        # while loop will continue until an index for two genes prefixed with
        # '0' and '1' is selected. At each iteration the chances the while
        # loop must continue reduce exponentially since the index is chosen
        # randomly from the same set of options.
        while '*' in substitute_genes:
            idx2 = random.randrange(0, len(orthogroup))
            gene = orthogroup[idx2]
            if (gene.startswith('0') and substitute_genes[0] == '*'):
                substitute_genes[0] = sequence_ids[gene]
            elif (gene.startswith('1') and substitute_genes[1] == '*'):
                substitute_genes[1] = sequence_ids[gene]
        # match donor and recipient gene labels to results output by
        # OrthoFinder (in sequence_ids)
        gene_donor_label = None
        gene_recip_label = None
        if substitute_genes[0] in genes_donor:
            gene_donor_label = substitute_genes[0]
            gene_recip_label = substitute_genes[1]
        elif substitute_genes[1] in genes_donor:
            gene_donor_label = substitute_genes[1]
            gene_recip_label = substitute_genes[0]
        else:
            raise ValueError("Gene %s and %s are not in donor genome"
                             % (substitute_genes[0], substitute_genes[1]))
        # rename recipient orthologous gene to donor's
        hgt_gene = "%s_hgt_o" % gene_donor_label
        genes_recip[hgt_gene] = genes_recip.pop(gene_recip_label)
        # replace recipient gene (translated sequence) with donor's
        genes_recip[hgt_gene][0] = genes_donor[gene_donor_label][0]
        # update end position of HGT gene (as it can be shorter/longer than
        # the recipient gene replaced), multiply length of substituted gene
        # by 3 to translate from codon to nucleotide length
        genes_recip[hgt_gene][2] =\
            genes_recip[hgt_gene][1] + len(genes_recip[hgt_gene][0])*3
        # replace recipient gene (nucleotide format) with donor's
        # NOTE: end_pos_recip is read after the update above, so it is the
        # new end position rather than the end of the gene being replaced
        start_pos_recip, end_pos_recip, strand_recip =\
            genes_recip[hgt_gene][1:]
        start_pos_donor, end_pos_donor, strand_donor =\
            genes_donor[gene_donor_label][1:]
        seq_recip_seq = (str(seq_recip_seq[:start_pos_recip]) +
                         str(seq_donor[start_pos_donor:end_pos_donor]) +
                         str(seq_recip_seq[end_pos_recip:]))
        # NOTE(review): only the strand label is copied from the donor; the
        # inserted nucleotides are not reverse-complemented -- confirm intent
        if strand_recip != strand_donor:
            genes_recip[hgt_gene][3] = genes_donor[gene_donor_label][3]
        # write HGTs to log file
        log_f.write("o\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            gene_donor_label,
            start_pos_donor,
            end_pos_donor,
            gene_recip_label,
            hgt_gene,
            start_pos_recip,
            end_pos_recip,
            strand_donor))
    # wrap the edited string in a new Sequence, keeping the original metadata
    seq_recip = Sequence(seq_recip_seq, metadata=seq_recip.metadata)
    return seq_recip
from skbio import Sequence
import skbio
from datetime import datetime
import os
import numpy as np

# Time how long scikit-bio takes to collect the distinct overlapping 4-mers
# of a genome file.
# NOTE(review): the raw file contents are passed to Sequence unchanged, so
# any FASTA header and newline characters are counted as sequence
# characters -- confirm this is intended.
with open('/home/castle/Downloads/GenomeDataTable1/aaa.fna') as genomeFile:
    genomeread = genomeFile.read()
print(len(genomeread))
genomeSeq = Sequence(genomeread)
t = datetime.now()
genomeKmers = set(map(str, genomeSeq.iter_kmers(4, overlap=True)))
# a timedelta cannot be concatenated to a str directly; convert it first
print("Kmer counting Elapse Time with SKBIO : " + str(datetime.now() - t))
def missing_qual_gen():
    # Three sequences of which only the middle one lacks quality scores.
    sequences = (
        RNA('A', positional_metadata={'quality': [42]}),
        Sequence('AG'),
        DNA('GG', positional_metadata={'quality': [41, 40]}),
    )
    yield from sequences
def test_no_kmers_found(self):
    # k (5) exceeds the length of both sequences (4)
    first, second = Sequence('ATCG'), Sequence('ACGT')
    distance = kmer_distance(first, second, 5)
    npt.assert_equal(distance, np.nan)
def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                 outfmt='sam', params=None):
    '''Annotate the sequences in the file.

    The query file is searched against each database in turn (the cache
    database first, when a non-empty cache is available). After each
    search, the sequences without an accepted hit so far are written to a
    temporary FASTA file, which becomes the query for the next database.

    Parameters
    ----------
    fp : str
        File path of the query sequences in FASTA format.
    aligner : str
        Aligner passed to ``run_blast``.
    evalue : float
        E-value passed to ``run_blast``.
    cpus : int
        Number of CPUs passed to ``run_blast``.
    outfmt : str
        Output format passed to ``run_view``; only 'tab' and 'sam' results
        are parsed and collected.
    params : dict or None
        Extra parameters passed to ``run_blast``.

    Returns
    -------
    pandas.DataFrame
        The accumulated filtered hits over all databases searched.
    '''
    if self.has_cache() and not self.cache.is_empty():
        self.cache.build()
        dbs = [self.cache.db] + self.dat
    else:
        dbs = self.dat
    seqs = []
    found = set()
    res = pd.DataFrame()
    logger = getLogger(__name__)
    for db in dbs:
        out_prefix = splitext(basename(db))[0]
        daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
        out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
        self.run_blast(fp, daa_fp, db, aligner=aligner,
                       evalue=evalue, cpus=cpus, params=params)
        self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works across pandas versions
        if outfmt == 'tab':
            res = pd.concat(
                [res, self._filter_best(self.parse_tabular(out_fp))])
        elif outfmt == 'sam':
            res = pd.concat(
                [res, self._filter_id_cov(self.parse_sam(out_fp))])
        # save to a tmp file the seqs that do not hit current database
        new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
        found |= set(res.index)
        with open(new_fp, 'w') as f:
            for seq in read(fp, format='fasta'):
                if seq.metadata['id'] not in found:
                    seq.write(f, format='fasta')
        logger.info('Number of diamond hits: %d', len(res.index))
        # no seq left
        if stat(new_fp).st_size == 0:
            break
        else:
            fp = new_fp
    if outfmt == 'sam' and self.has_cache():
        for x in res.index:
            seqs.append(
                Sequence(res.loc[x, 'sseq'],
                         metadata={'id': res.loc[x, 'sseqid']}))
    # Update cache (inplace)
    if self.has_cache():
        self.cache.update(seqs)
        self.cache.close()
    return res
def test_empty_sequences(self):
    # both inputs are empty
    empty_a, empty_b = Sequence(''), Sequence('')
    distance = kmer_distance(empty_a, empty_b, 3)
    npt.assert_equal(distance, np.nan)
def test_one_empty_sequence(self):
    # only the first input is empty
    empty = Sequence('')
    nonempty = Sequence('CGGGCAGCTCCTACCTGCTA')
    self.assertAlmostEqual(kmer_distance(empty, nonempty, 3), 1.0)
def test_return_type(self):
    # identical inputs: result must be a float equal to zero
    left = Sequence('ATCG')
    right = Sequence('ATCG')
    result = kmer_distance(left, right, 3)
    self.assertIsInstance(result, float)
    self.assertEqual(result, 0.0)
def test_differing_length_seqs(self):
    # inputs of length 20 and 15
    longer = Sequence('AGAAATCTGAGCAAGGATCA')
    shorter = Sequence('TTAGTGCGTAATCCG')
    expected = 0.9285714285714286
    self.assertAlmostEqual(kmer_distance(longer, shorter, 3), expected)
def test_same_sequence(self):
    # two separate objects holding the same characters
    left = Sequence('CTGCGACAGTTGGTA')
    right = Sequence('CTGCGACAGTTGGTA')
    self.assertEqual(kmer_distance(left, right, 3), 0.0)
def test_entirely_different_sequences(self):
    # expected distance is the maximum, 1.0
    left = Sequence('CCGTGGTCGTATAAG')
    right = Sequence('CGCCTTCCACATCAG')
    self.assertEqual(kmer_distance(left, right, 3), 1.0)
def test_k_less_than_one_error(self):
    # k == 0 must be rejected
    left, right = Sequence('ATCG'), Sequence('ACTG')
    message = r'k must be greater than 0.'
    with self.assertRaisesRegex(ValueError, message):
        kmer_distance(left, right, 0)
def blank_seq_gen():
    # The middle sequence is empty; all three are built before the first
    # one is yielded.
    sequences = (DNA('A'), Sequence(''), RNA('GG'))
    for sequence in sequences:
        yield sequence
def test_type_mismatch_error(self):
    # a plain Sequence compared against a DNA must raise
    generic = Sequence('ABC')
    nucleotide = DNA('ATC')
    pattern = r"Type 'Sequence'.*type 'DNA'"
    with self.assertRaisesRegex(TypeError, pattern):
        kmer_distance(generic, nucleotide, 3)
def calculateHammingDistance(seq1, seq2):
    """Returns hamming distance between two equal length sequences"""
    # wrap both inputs as Sequence objects before comparing them
    first = Sequence(seq1)
    second = Sequence(seq2)
    return hamming(first, second)
def test_non_sequence_error(self):
    # the second argument is a plain str rather than a Sequence
    wrapped = Sequence('ATCG')
    plain = 'ATCG'
    pattern = r"not 'str'"
    with self.assertRaisesRegex(TypeError, pattern):
        kmer_distance(wrapped, plain, 3)
def blank_seq_gen():
    # The middle sequence is empty; all three are built before the first
    # one is yielded.
    yield from (DNA('A'), Sequence(''), RNA('GG'))
def test_length_mismatch(self):
    # lengths 3 and 4 must be rejected, and both reported in the message
    three = Sequence('ABC')
    four = Sequence('ABCD')
    pattern = r'equal length.*3 != 4'
    with self.assertRaisesRegex(ValueError, pattern):
        hamming(three, four)
def gen():
    # Each entry of the enclosing `components` is indexed as
    # (id, description, sequence, quality).
    for component in components:
        metadata = {'id': component[0], 'description': component[1]}
        positional = {'quality': component[3]}
        yield Sequence(component[2], metadata=metadata,
                       positional_metadata=positional)
def test_default_kwargs(self):
    # no keyword arguments are passed to kmer_distance
    left = Sequence('AACCTAGCAATGGAT')
    right = Sequence('CAGGCAGTTCTCACC')
    expected = 0.9130434782608695
    self.assertAlmostEqual(kmer_distance(left, right, 3), expected)
def setUp(self):
    # Fixtures: paths to the GFF3 test files, three IntervalMetadata
    # objects populated from the interval definitions below, and an
    # annotated Sequence together with its DNA counterpart.
    self.multi_fp = get_data_path('gff3_multi_record')
    self.single_fp = get_data_path('gff3_single_record')

    chromosome = {
        'bounds': [(0, 4641652)],
        'metadata': {
            'source': 'European Nucleotide Archive',
            'type': 'chromosome',
            'score': '.',
            'strand': '.',
            'ID': 'chromosome:Chromosome',
            'Alias': 'U00096.3',
            'Is_circular': 'true'
        }
    }
    promoter = {
        'bounds': [(147, 148)],
        'metadata': {
            'source': 'regulondb_feature',
            'type': 'biological_region',
            'score': '.',
            'strand': '+',
            'external_name': 'Promoter thrLp (RegulonDB:ECK120010236)',
            'logic_name': 'regulondb_promoter'
        }
    }
    gene = {
        'bounds': [(336, 2799)],
        'metadata': {
            'source': 'Prodigal_v2.60',
            'type': 'gene',
            'score': '1.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_1',
            'gc_cont': '0.427'
        }
    }
    cds = {
        'bounds': [(336, 2799)],
        'metadata': {
            'source': 'Prodigal_v2.60',
            'type': 'CDS',
            'score': '333.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_2',
            'Parent': '1_1',
            'rbs_motif': 'GGAG/GAGG',
            'rbs_spacer': '5-10bp'
        }
    }
    # a gene made of two non-contiguous spans
    split_gene = {
        'bounds': [(0, 50), (55, 100)],
        'metadata': {
            'source': 'Prodigal_v2.60',
            'type': 'gene',
            'score': '1.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_1',
            'gene': 'FXR receptor'
        }
    }

    self.upper_bound = 4641652
    self.imd1 = IntervalMetadata(self.upper_bound)
    for intvl in (chromosome, promoter):
        self.imd1.add(**intvl)

    self.imd2 = IntervalMetadata(None)
    for intvl in (gene, cds):
        self.imd2.add(**intvl)

    self.imd3 = IntervalMetadata(None)
    self.imd3.add(**split_gene)

    self.seq_fp = get_data_path('gff3_dna')
    seq_metadata = {'id': 'NC_1', 'description': 'species X'}
    self.seq = Sequence('ATGCATGCATGC', metadata=seq_metadata)
    gene1_metadata = {
        'source': 'Prodigal_v2.60',
        'type': 'gene',
        'score': '.',
        'strand': '+',
        'phase': 0,
        'ID': 'gene1',
        'Name': 'FXR'
    }
    self.seq.interval_metadata.add([(0, 9)], metadata=gene1_metadata)
    self.dna = DNA(self.seq)
def test_invalid_type(self):
    # a plain Sequence as the second argument is rejected
    sequence_pattern = r"not type 'Sequence'"
    with self.assertRaisesRegex(TypeError, sequence_pattern):
        local_pairwise_align_ssw(DNA('ACGT'), Sequence('ACGT'))

    # a str as the first argument is rejected
    str_pattern = r"not type 'str'"
    with self.assertRaisesRegex(TypeError, str_pattern):
        local_pairwise_align_ssw('ACGU', RNA('ACGU'))
def simulate_novel_acq(genes_donor,
                       seq_donor,
                       genes_recip,
                       seq_recip,
                       orthologous_rep_prob,
                       percentage_hgts,
                       log_f):
    """Simulate novel gene acquisition HGT.

    Parameters
    ----------
    genes_donor: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists (translated sequence, start, end, strand)
    seq_donor: skbio.sequence.Sequence
        Sequence object for donor genome
    genes_recip: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists (translated sequence, start, end, strand); modified in place
    seq_recip: skbio.sequence.Sequence
        Sequence object for recipient genome
    orthologous_rep_prob: float
        Probability HGT will be orthologous replacement
    percentage_hgts: float
        Percent of HGTs to simulate
    log_f: file descriptor
        Log file descriptor

    Returns
    -------
    seq_recip: skbio.sequence.Sequence
        recipient genome sequence with HGTs

    Notes
    -----
    Algorithm:
        1. choose random location in recipient genome where to insert a gene
           (chosen from list of donor genes)
        2. use gene (recipient) positioning array to locate an open region
           (that doesn't include an existing gene) near the random location
           to insert the new gene (we want to avoid gene overlap so that
           compositional methods can clearly pick out individual coding
           genes)
        3. insert new gene and record it in genes_recip
    """
    # compute number of HGTs to simulate (novel acquisition), at least one
    num_hgts = int(percentage_hgts * (1 - orthologous_rep_prob) *
                   len(genes_recip))
    num_hgts = max(1, num_hgts)
    # add start and end positions of recipient genome to allow for HGTs
    # simulated before the first and after the last existing gene
    gene_positions = [(0, 0), (len(seq_recip), len(seq_recip))]
    # create recipient genome gene positioning array
    for seq, start, stop, strand in genes_recip.values():
        gene_positions.append((start, stop))
    # sort array for gene positions in ascending order
    gene_positions_s = sorted(gene_positions)
    # select a random list of positions where to insert the new gene
    # NOTE(review): both random.sample calls raise ValueError when num_hgts
    # exceeds the population size -- confirm callers guarantee this
    idx = random.sample(range(len(gene_positions_s) - 1), num_hgts)
    gene_donor_labels = random.sample(list(genes_donor), num_hgts)
    log_f.write("#type\tdonor\tstart\tend\trecipient\tstart\t"
                "end\tstrand\n")
    seq_recip_seq = str(seq_recip)
    # begin simulation
    for x in range(num_hgts):
        # select donor gene (for HGT)
        gene_donor_label = gene_donor_labels[x]
        # first candidate position: one past the end of the chosen gene
        idx_recip = gene_positions_s[idx[x]][1] + 1
        # beginning from valid position for inserting new gene, check whether
        # the length can fit without overlapping with existing gene, otherwise
        # search for next valid position
        # NOTE(review): gene_positions_s is not updated after an insertion,
        # so later iterations test against the original coordinates --
        # confirm intent
        for y in range(idx[x], len(gene_positions_s) - 1):
            if idx_recip + len(genes_donor[gene_donor_label][0])*3 <\
                    gene_positions_s[y+1][0]:
                # codon = sequence of 3 nucleotides (hence *3)
                idx_end = idx_recip + len(genes_donor[gene_donor_label][0]) * 3
                # insert gene (protein)
                hgt_gene = "%s_hgt_n" % gene_donor_label
                genes_recip[hgt_gene] =\
                    [genes_donor[gene_donor_label][0], idx_recip,
                     idx_end, genes_donor[gene_donor_label][3]]
                # insert gene (nucleotide)
                seq_recip_seq = (
                    str(seq_recip_seq[:idx_recip]) +
                    str(seq_donor[genes_donor[gene_donor_label][1]:
                                  genes_donor[gene_donor_label][2]]) +
                    str(seq_recip_seq[idx_recip:]))
                # write HGTs to log file
                log_f.write(
                    "n\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                        gene_donor_label,
                        genes_donor[gene_donor_label][1],
                        genes_donor[gene_donor_label][2],
                        hgt_gene,
                        idx_recip,
                        idx_end,
                        genes_donor[gene_donor_label][3]))
                break
            # try next open region
            idx_recip = gene_positions_s[y + 1][1] + 1
    # wrap the edited string in a new Sequence, keeping the original metadata
    seq_recip = Sequence(seq_recip_seq, metadata=seq_recip.metadata)
    return seq_recip
def test_global_pairwise_align_invalid_type(self):
    # a plain Sequence as the second argument is rejected
    pattern = ("GrammaredSequence.*"
               "TabularMSA.*'Sequence'")
    with self.assertRaisesRegex(TypeError, pattern):
        global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def hamming_distance(s1, s2):
    # Wrap both inputs as Sequence objects and return the result of
    # Sequence.distance between them.
    first = Sequence(s1)
    second = Sequence(s2)
    return first.distance(second)
def test_local_pairwise_align_invalid_type(self):
    # a plain Sequence as the second argument is rejected
    pattern = 'GrammaredSequence.*Sequence'
    with self.assertRaisesRegex(TypeError, pattern):
        local_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def test_single_character_sequences(self):
    # one-character inputs: same object gives 0.0, different character 1.0
    char_a = Sequence('a')
    char_b = Sequence('b')
    self.assertEqual(hamming(char_a, char_a), 0.0)
    self.assertEqual(hamming(char_a, char_b), 1.0)
def test_overlap_false(self):
    # k-mers are taken with overlap disabled
    left = Sequence('CGTTATGTCTGTGAT')
    right = Sequence('CTGAATCGGTAGTGT')
    expected = 0.8888888888888888
    observed = kmer_distance(left, right, 3, overlap=False)
    self.assertAlmostEqual(observed, expected)