Example #1
0
 def test_write_results(self):
     """Test writing HGT results to FASTA and GenBank files.

     Checks the nucleotide FASTA, GenBank and protein FASTA outputs of
     write_results() for both the donor and the recipient genome.
     """
     self.genes_recip['D_2_hgt_n'] = ['MKKNIILNLIGLRCPEPIMI', 321, 381, '+']
     donor_genbank_fp = join(self.proteomes_dir, "donor.fna")
     recipient_genbank_fp = join(self.proteomes_dir, "recip.fna")
     dnr_g_nucl_fp, dnr_g_aa_fp, dnr_g_gb_fp, \
         rcp_g_nucl_fp, rcp_g_aa_fp, rcp_g_gb_fp =\
         write_results(self.genes_donor,
                       donor_genbank_fp,
                       self.genes_recip,
                       recipient_genbank_fp,
                       self.seq_donor,
                       self.seq_recip,
                       self.simulated_dir)
     donor_nucl = Sequence.read(dnr_g_nucl_fp, format='fasta')
     # test for correctness of donor nucleotide genome sequence
     self.assertEqual(str(donor_nucl), str(self.seq_donor))
     recip_nucl = Sequence.read(rcp_g_nucl_fp, format='fasta')
     # test for correctness of recipient nucleotide genome sequence
     self.assertEqual(str(recip_nucl), str(self.seq_recip))
     # expected LOCUS metadata; name and size are filled in per genome below
     locus = {'unit': 'bp', 'shape': 'circular', 'division': 'CON',
              'mol_type': 'DNA', 'date': '01-JAN-1900'}
     donor_gb = Sequence.read(dnr_g_gb_fp, format='genbank')
     locus['locus_name'] = 'donor'
     locus['size'] = len(str(self.seq_donor))
     # test for correctness of donor GenBank file
     self.assertEqual(str(donor_gb), str(self.seq_donor))
     self.assertDictEqual(donor_gb.metadata['LOCUS'], locus)
     recip_gb = Sequence.read(rcp_g_gb_fp, format='genbank')
     locus['locus_name'] = 'recipient'
     locus['size'] = len(str(self.seq_recip))
     # test for correctness of recipient GenBank file
     self.assertEqual(str(recip_gb), str(self.seq_recip))
     self.assertDictEqual(recip_gb.metadata['LOCUS'], locus)
     donor_aa_dict = {}
     for seq in skbio.io.read(dnr_g_aa_fp, format='fasta'):
         donor_aa_dict[seq.metadata['id']] = str(seq)
     # test for correctness of donor protein coding sequences
     # (assertEqual, not assertTrue: the latter would treat the second
     # argument as a failure message and never compare the two counts)
     self.assertEqual(len(donor_aa_dict), len(self.genes_donor))
     for gene in donor_aa_dict:
         self.assertIn(gene, self.genes_donor)
         self.assertEqual(donor_aa_dict[gene], self.genes_donor[gene][0])
     recip_aa_dict = {}
     for seq in skbio.io.read(rcp_g_aa_fp, format='fasta'):
         recip_aa_dict[seq.metadata['id']] = str(seq)
     # test for correctness of recipient protein coding sequences
     self.assertEqual(len(recip_aa_dict), len(self.genes_recip))
     for gene in recip_aa_dict:
         self.assertIn(gene, self.genes_recip)
         self.assertEqual(recip_aa_dict[gene], self.genes_recip[gene][0])
Example #2
0
 def test_write_results(self):
     """Test writing HGT results to FASTA files.

     Checks the nucleotide and protein FASTA outputs of write_results()
     for both the donor and the recipient genome.
     """
     self.genes_recip['D_2_hgt_n'] = ['MKKNIILNLIGLRCPEPIMI', 321, 381, '+']
     donor_genbank_fp = join(self.proteomes_dir, "donor.fna")
     recipient_genbank_fp = join(self.proteomes_dir, "recip.fna")
     dnr_g_nucl_fp, dnr_g_aa_fp, rcp_g_nucl_fp, rcp_g_aa_fp =\
         write_results(self.genes_donor,
                       donor_genbank_fp,
                       self.genes_recip,
                       recipient_genbank_fp,
                       self.seq_donor,
                       self.seq_recip,
                       self.simulated_dir)
     donor_nucl = Sequence.read(dnr_g_nucl_fp, format='fasta')
     # test for correctness of donor nucleotide genome sequence
     self.assertEqual(str(donor_nucl), str(self.seq_donor))
     recip_nucl = Sequence.read(rcp_g_nucl_fp, format='fasta')
     # test for correctness of recipient nucleotide genome sequence
     self.assertEqual(str(recip_nucl), str(self.seq_recip))
     donor_aa_dict = {}
     for seq in skbio.io.read(dnr_g_aa_fp, format='fasta'):
         donor_aa_dict[seq.metadata['id']] = str(seq)
     # test for correctness of donor protein coding sequences
     # (assertEqual, not assertTrue: the latter would treat the second
     # argument as a failure message and never compare the two counts)
     self.assertEqual(len(donor_aa_dict), len(self.genes_donor))
     for gene in donor_aa_dict:
         self.assertIn(gene, self.genes_donor)
         self.assertEqual(donor_aa_dict[gene], self.genes_donor[gene][0])
     recip_aa_dict = {}
     for seq in skbio.io.read(rcp_g_aa_fp, format='fasta'):
         recip_aa_dict[seq.metadata['id']] = str(seq)
     # test for correctness of recipient protein coding sequences
     self.assertEqual(len(recip_aa_dict), len(self.genes_recip))
     for gene in recip_aa_dict:
         self.assertIn(gene, self.genes_recip)
         self.assertEqual(recip_aa_dict[gene], self.genes_recip[gene][0])
Example #3
0
def parse_genemark(input_f, genbank_fp):
    """ Extract atypical genes identified by GeneMark

    Parameters
    ----------
    input_f: iterable of strings
        open file handle (or any iterable of lines) for GeneMark output
        gene list (*.lst)
    genbank_fp: string
        file path to genome in GenBank format

    Notes
    -----
    genbank_fp is the intermediate GenBank file generated by reformat_input.py,
    in which multiple sequences are concatenated, instead of the original
    GenBank file.

    Lines are only interpreted after the header line made of the two tokens
    '#' and 'Length' has been seen.

    Returns
    -------
    output: string
        gene names (protein_ids) separated by newline
    """
    # Index CDS features by (start, end, strand) so that each GeneMark line
    # is resolved with one dictionary lookup instead of a scan over all genes.
    genes_by_coords = {}
    seen_ids = set()
    gb = Sequence.read(genbank_fp, format='genbank')
    for feature in gb.interval_metadata._intervals:
        m = feature.metadata
        if m['type'] == 'CDS' and 'protein_id' in m:
            protein_id = m['protein_id'].replace('\"', '')
            # only the first occurrence of a protein_id is recorded
            if protein_id not in seen_ids:
                seen_ids.add(protein_id)
                # in scikit-bio, this number is the start location - 1
                coords = (feature.bounds[0][0] + 1,
                          feature.bounds[0][1],
                          m['strand'])
                genes_by_coords.setdefault(coords, []).append(protein_id)
    atypical_genes = []
    reading = False
    for line in input_f:
        fields = line.strip().split()
        if fields == ['#', 'Length']:
            reading = True
        # atypical genes have class '2' in the 6th column
        elif reading and len(fields) == 6 and fields[5] == '2':
            # partial genes carry a leading '<' or '>' on their coordinates
            coords = (int(fields[2].lstrip('<>')),
                      int(fields[3].lstrip('<>')),
                      fields[1])
            atypical_genes.extend(genes_by_coords.get(coords, ()))
    return '\n'.join(sorted(atypical_genes))
Example #4
0
def parse_egid(input_f, genbank_fp):
    """ Extract genes contained in GIs identified by EGID

    Parameters
    ----------
    input_f: iterable of strings
        open file handle for EGID output results (GI coordinates)
    genbank_fp: string
        file path to genome in GenBank format (containing gene coordinates)

    Notes
    -----
    genbank_fp is the intermediate GenBank file generated by reformat_input.py,
    in which multiple sequences are concatenated, instead of the original
    GenBank file.

    A gene is reported when both its start and end lie within a GI.

    Returns
    -------
    output: string
        gene names (protein_ids) separated by newline
    """
    record = Sequence.read(genbank_fp, format='genbank')
    coords_by_gene = {}
    for interval in record.interval_metadata._intervals:
        meta = interval.metadata
        if meta['type'] != 'CDS' or 'protein_id' not in meta:
            continue
        name = meta['protein_id'].replace('\"', '')
        if name in coords_by_gene:
            # keep the coordinates of the first occurrence only
            continue
        # scikit-bio stores the start as (start location - 1)
        first = interval.bounds[0][0] + 1
        last = interval.bounds[0][1]
        coords_by_gene[name] = (first, last)
    contained = set()
    for line in input_f:
        columns = line.strip().split()
        # a valid GI definition should have at least 2 columns
        if len(columns) < 2:
            continue
        gi_start = int(columns[0])
        gi_end = int(columns[1])
        contained.update(
            name for name, (first, last) in coords_by_gene.items()
            if first >= gi_start and last <= gi_end)
    return '\n'.join(sorted(contained))
Example #5
0
def extract_genbank(genbank_fp, verbose=False):
    """Extract protein coding sequences from GenBank record.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    verbose: boolean, optional
        if True, write progress messages to standard output

    Returns
    -------
    seq: skbio.sequence.Sequence
        Sequence object
    genes: dictionary
        a dictionary of genes (CDS) and their info, with the key being the
        protein IDs and the value being a 4-element list including the
        translated sequence, the start and end positions in the genome,
        and the strand

    Raises
    ------
    KeyError
        if the same protein ID is found on more than one CDS feature
    """
    if verbose:
        sys.stdout.write('\tParse GenBank record ...\n')
    seq = Sequence.read(genbank_fp, format='genbank')
    if verbose:
        sys.stdout.write('\t\tDone.\n')
    genes = {}
    for interval in seq.interval_metadata._intervals:
        meta = interval.metadata
        if meta['type'] != 'CDS':
            continue
        gene = meta['protein_id'].replace('\"', '')
        protein = meta['translation'].replace(' ', '').replace('\"', '')
        strand = meta['strand']
        # in scikit-bio, the stored lower bound is the start location - 1
        first_base = interval.bounds[0][0] + 1
        last_base = interval.bounds[0][1]
        if gene in genes:
            raise KeyError('%s already exists in dictionary' % gene)
        genes[gene] = [protein, first_base, last_base, strand]
    return seq, genes
Example #6
0
def extract_genbank(genbank_fp, verbose=False):
    """Extract protein coding sequences from GenBank record.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    verbose: boolean, optional
        if True, write progress messages to standard output

    Returns
    -------
    seq: skbio.sequence.Sequence
        Sequence object
    genes: dictionary
        a dictionary of genes (CDS) and their info, with the key being the
        protein IDs and the value being a 4-element list including the
        translated sequence, the start and end positions in the genome,
        and the strand

    Raises
    ------
    KeyError
        if the same protein ID is found on more than one CDS feature
    """
    genes = {}
    if verbose:
        sys.stdout.write("\tParse GenBank record ...\n")
    seq = Sequence.read(genbank_fp, format='genbank')
    if verbose:
        sys.stdout.write("\t\tDone.\n")
    for feature in seq.interval_metadata.features:
        if feature['type_'] == 'CDS':
            # Strip the quotes BEFORE the duplicate check: the dictionary is
            # keyed by the unquoted ID, so testing the raw value would never
            # detect a duplicate of a quoted protein_id.
            gene = feature['protein_id'].replace("\"", "")
            translation = feature['translation']
            strand = '-' if feature['rc_'] else '+'
            loc = seq.interval_metadata.features[feature]
            start_pos = loc[0][0]
            end_pos = loc[0][1]
            if gene in genes:
                raise KeyError("%s already exists in dictionary" % gene)
            genes[gene] = [
                translation.replace(" ", "").replace("\"", ""),
                start_pos, end_pos, strand]
    return seq, genes
Example #7
0
    def test_traceback(self):
        """Exercise _traceback from several start cells and an early stop."""
        score_m = [[0, -5, -7, -9], [-5, 2, -3, -5], [-7, -3, 4, -1],
                   [-9, -5, -1, 6], [-11, -7, -3, 1]]
        score_m = np.array(score_m)
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 1, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        # start at bottom-right
        expected = ([Sequence("ACG-", metadata={'id': '0'})],
                    [Sequence("ACGT", metadata={'id': '1'})], 1, 0, 0)
        actual = _traceback(tback_m, score_m,
                            Alignment([DNA('ACG', metadata={'id': ''})]),
                            Alignment([DNA('ACGT', metadata={'id': ''})]), 4,
                            3)
        self.assertEqual(actual, expected)

        # four sequences in two alignments
        score_m = [[0, -5, -7, -9], [-5, 2, -3, -5], [-7, -3, 4, -1],
                   [-9, -5, -1, 6], [-11, -7, -3, 1]]
        score_m = np.array(score_m)
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 1, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        # start at bottom-right
        expected = ([
            Sequence("ACG-", metadata={'id': 's1'}),
            Sequence("ACG-", metadata={'id': 's2'})
        ], [
            Sequence("ACGT", metadata={'id': 's3'}),
            Sequence("ACGT", metadata={'id': 's4'})
        ], 1, 0, 0)
        actual = _traceback(
            tback_m, score_m,
            Alignment([
                DNA('ACG', metadata={'id': 's1'}),
                DNA('ACG', metadata={'id': 's2'})
            ]),
            Alignment([
                DNA('ACGT', metadata={'id': 's3'}),
                DNA('ACGT', metadata={'id': 's4'})
            ]), 4, 3)
        self.assertEqual(actual, expected)

        # start at highest-score
        expected = ([Sequence("ACG", metadata={'id': '0'})],
                    [Sequence("ACG", metadata={'id': '1'})], 6, 0, 0)
        actual = _traceback(tback_m, score_m,
                            Alignment([DNA('ACG', metadata={'id': ''})]),
                            Alignment([DNA('ACGT', metadata={'id': ''})]), 3,
                            3)
        self.assertEqual(actual, expected)

        # terminate traceback before top-right
        # (cell [2][2] of the traceback matrix is 0 here instead of 1)
        tback_m = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 0, 3], [2, 2, 2, 1],
                   [2, 2, 2, 2]]
        tback_m = np.array(tback_m)
        expected = ([Sequence("G", metadata={'id': '0'})],
                    [Sequence("G", metadata={'id': '1'})], 6, 2, 2)
        actual = _traceback(tback_m, score_m,
                            Alignment([DNA('ACG', metadata={'id': ''})]),
                            Alignment([DNA('ACGT', metadata={'id': ''})]), 3,
                            3)
        self.assertEqual(actual, expected)
Example #8
0
def _merge_genbank_seqs(genbank_fp):
    """ Merge one to multiple sequences in a GenBank file into one.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format

    Returns
    -------
    tuple of (
        skbio.Sequence,
            Genome sequence, genes and metadata
        dict of { list of [ string, int, int, string ] }
            Gene name : translation, start, end, and strand
    )

    Notes
    -----
    Gene coordinates are shifted by the cumulative size of the preceding
    records so that they refer to the concatenated sequence. Only the first
    CDS seen for a given protein_id is kept.
    """
    genes = {}
    # number of nucleotide sequences: one '//' terminator line per record
    with open(genbank_fp, 'r') as input_f:
        nseq = sum(1 for line in input_f if line.startswith('//'))
    nucl_parts = []  # per-record sequences, joined once after the loop
    abs_pos = 0  # absolute position in concatenated nucleotide sequence
    for i in range(nseq):
        gb = Sequence.read(genbank_fp, seq_num=i+1, format='genbank')
        size = gb.metadata['LOCUS']['size']
        nucl_parts.append(str(gb))
        for feature in gb.interval_metadata._intervals:
            m = feature.metadata
            if m['type'] == 'CDS' and 'protein_id' in m:
                protein_id = m['protein_id'].replace('\"', '')
                if protein_id not in genes:
                    translation = m['translation'].replace(' ', '') \
                        .replace('\"', '')
                    strand = m['strand']
                    # scikit-bio lower bound is the start location - 1
                    start = feature.bounds[0][0] + abs_pos + 1
                    end = feature.bounds[0][1] + abs_pos
                    genes[protein_id] = [translation, start, end, strand]
        abs_pos += int(size)
    nucl_seq = ''.join(nucl_parts)
    gb = DNA(nucl_seq)
    # generate mock metadata for the merged sequence
    gb.metadata['LOCUS'] = {'locus_name': 'locus001', 'size': len(nucl_seq),
                            'unit': 'bp', 'shape': 'circular',
                            'division': 'CON', 'mol_type': 'DNA',
                            'date': '01-JAN-1900'}
    gb.metadata['id'] = 'locus001'
    gb.interval_metadata._intervals = []
    # genes are numbered incrementally (from 1) in order of start position
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    for gid, (gene, info) in enumerate(ordered, start=1):
        translation, start, end, strand = info
        # generate "gene" and "CDS" records for each protein-coding gene
        location = str(start) + '..' + str(end)  # start and end coordinates
        if strand == '-':  # negative strand
            location = 'complement(' + location + ')'
        feature = {'type': 'gene', 'locus_tag': 'gene' + str(gid),
                   '__location': location}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
        feature = {'type': 'CDS', 'locus_tag': 'gene' + str(gid),
                   '__location': location, 'protein_id': gene,
                   'translation': translation}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
    return (gb, genes)
Example #9
0
 def gen():
     """Yield two sequences with equal id/description; only the first has
     quality scores."""
     with_quality = Sequence(
         'ACGT',
         metadata={'id': 'foo', 'description': 'bar'},
         positional_metadata={'quality': range(4)})
     yield with_quality
     without_quality = Sequence(
         'ACG', metadata={'id': 'foo', 'description': 'bar'})
     yield without_quality
Example #10
0
def simulate_orthologous_rep(genes_donor, seq_donor, genes_recip, seq_recip,
                             sequence_ids, orthologous_groups,
                             orthologous_rep_prob, percentage_hgts, log_f):
    """Simulate orthologous replacement HGT.

    Parameters
    ----------
    genes_donor: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists [translated sequence, start, end, strand]
    seq_donor: skbio.sequence.Sequence
        Sequence object for donor genome
    genes_recip: dictionary
        A dictionary of genes, keys are protein IDs and values are 4-element
        lists [translated sequence, start, end, strand]; modified in place
        (replaced genes are re-keyed as "<donor label>_hgt_o")
    seq_recip: skbio.sequence.Sequence
        Sequence object for recipient genome
    sequence_ids: dictionary
        Keys are in the form x_y (species_gene) and values are original
        accessions
    orthologous_groups: list of lists
        List of orthologous families between donor and recipient proteomes
    orthologous_rep_prob: float
        Probability that an HGT will be an orthologous replacement
    percentage_hgts: float
        Percent of HGTs to simulate
    log_f: file descriptor
        Log file descriptor; one header line plus one tab-separated line per
        simulated HGT is written to it

    Returns
    -------
    seq_recip: skbio.sequence.Sequence
        recipient genome sequence with HGTs (a new Sequence carrying the
        metadata of the input seq_recip)

    Raises
    ------
    ValueError
        if neither of the two genes selected from an orthogroup is present
        in genes_donor

    Notes
    -----
    Using list of orthologous genes between donor and recipient genomes,
    randomly choose genes to exchange from donor to recipient. This function
    itself only updates genes_recip, builds the new recipient sequence and
    writes to the log file.

    Algorithm:
        1. Choose randomly N orthogroups to be used for simulating HGTs
        2. For each orthogroup, choose randomly a donor and recipient gene
        3. Replace recipient gene with donor (translated sequence in
           genes_recip and nucleotides in the recipient sequence string).
    """
    # number of HGTs to simulate
    num_hgts = int(percentage_hgts * orthologous_rep_prob * len(genes_recip))
    if num_hgts < 1:
        num_hgts = 1
    num_orthogroups = len(orthologous_groups)
    # NOTE(review): random.sample raises ValueError when num_hgts exceeds
    # num_orthogroups; confirm callers guarantee enough orthogroups.
    idx = random.sample(range(num_orthogroups), num_hgts)
    log_f.write("#type\tdonor\tstart\tend\trecipient\tnew label "
                "recipient\tstart\tend\tstrand\n")
    # work on a plain string copy of the recipient genome
    seq_recip_seq = str(seq_recip)
    for i in idx:
        orthogroup = orthologous_groups[i]
        # placeholders: [donor gene, recipient gene]; '*' means not yet chosen
        substitute_genes = ['*', '*']
        # Randomly select two orthologous genes from the same family
        # representing the donor and recipient genomes. Each orthogroup is
        # guaranteed to have at least two genes, one from donor (prefixed with
        # '0') and second from recipient (prefixed with '1'). The following
        # while loop will continue until an index for two genes prefixed with
        # '0' and '1' is selected. At each iteration the chances the while
        # loop must continue reduce exponentially since the index is chosen
        # randomly from the same set of options.
        # NOTE(review): the loop never terminates if an orthogroup lacks a
        # gene with one of the two prefixes; relies on the guarantee above.
        while '*' in substitute_genes:
            idx2 = random.randrange(0, len(orthogroup))
            gene = orthogroup[idx2]
            if (gene.startswith('0') and substitute_genes[0] == '*'):
                substitute_genes[0] = sequence_ids[gene]
            elif (gene.startswith('1') and substitute_genes[1] == '*'):
                substitute_genes[1] = sequence_ids[gene]
        # match donor and recipient gene labels to results output by
        # OrthoFinder (in sequence_ids)
        gene_donor_label = None
        gene_recip_label = None
        if substitute_genes[0] in genes_donor:
            gene_donor_label = substitute_genes[0]
            gene_recip_label = substitute_genes[1]
        elif substitute_genes[1] in genes_donor:
            gene_donor_label = substitute_genes[1]
            gene_recip_label = substitute_genes[0]
        else:
            raise ValueError("Gene %s and %s are not in donor genome" %
                             (substitute_genes[0], substitute_genes[1]))
        # rename recipient orthologous gene to donor's
        hgt_gene = "%s_hgt_o" % gene_donor_label
        genes_recip[hgt_gene] = genes_recip.pop(gene_recip_label)
        # replace recipient gene (translated sequence) with donor's
        genes_recip[hgt_gene][0] = genes_donor[gene_donor_label][0]
        # update end position of HGT gene (as it can be shorter/longer than
        # the recipient gene replaced), multiply length of substituted gene
        # by 3 to translate from codon to nucleotide length
        genes_recip[hgt_gene][2] =\
            genes_recip[hgt_gene][1] + len(genes_recip[hgt_gene][0])*3
        # replace recipient gene (nucleotide format) with donor's
        # NOTE(review): end_pos_recip is read after the end position was
        # updated above, so the slice removed from the recipient has the
        # donor gene's length rather than the original recipient gene's;
        # coordinates of other recipient genes are not shifted either.
        # Confirm this is intended.
        start_pos_recip, end_pos_recip, strand_recip =\
            genes_recip[hgt_gene][1:]
        start_pos_donor, end_pos_donor, strand_donor =\
            genes_donor[gene_donor_label][1:]
        seq_recip_seq = (str(seq_recip_seq[:start_pos_recip]) +
                         str(seq_donor[start_pos_donor:end_pos_donor]) +
                         str(seq_recip_seq[end_pos_recip:]))
        # the HGT gene takes the donor's strand
        if strand_recip != strand_donor:
            genes_recip[hgt_gene][3] = genes_donor[gene_donor_label][3]
        # write HGTs to log file
        log_f.write("o\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                    (gene_donor_label, start_pos_donor, end_pos_donor,
                     gene_recip_label, hgt_gene, start_pos_recip,
                     end_pos_recip, strand_donor))
    # wrap the edited string in a new Sequence, keeping the input metadata
    seq_recip = Sequence(seq_recip_seq, metadata=seq_recip.metadata)
    return seq_recip
Example #11
0
from skbio import Sequence
import skbio
from datetime import datetime
import os
import numpy as np

# Read the whole genome file into memory; the context manager guarantees the
# handle is closed once the contents have been read.
with open('/home/castle/Downloads/GenomeDataTable1/aaa.fna') as genomeFile:
    genomeread = genomeFile.read()
print(len(genomeread))

genomeSeq = Sequence(genomeread)
t = datetime.now()

# distinct overlapping 4-mers of the sequence, as plain strings
genomeKmers = set(map(str, genomeSeq.iter_kmers(4, overlap=True)))

# the elapsed time is a timedelta and must be converted explicitly:
# concatenating it to a str directly raises TypeError
print("Kmer counting Elapse Time with SKBIO : " + str(datetime.now() - t))
Example #12
0
 def missing_qual_gen():
     """Yield three sequences; the middle one carries no quality scores."""
     rna_with_qual = RNA('A', positional_metadata={'quality': [42]})
     seq_without_qual = Sequence('AG')
     dna_with_qual = DNA('GG', positional_metadata={'quality': [41, 40]})
     yield from (rna_with_qual, seq_without_qual, dna_with_qual)
Example #13
0
 def test_no_kmers_found(self):
     """Distance is NaN when k (5) exceeds the length of both sequences."""
     first = Sequence('ATCG')
     second = Sequence('ACGT')
     npt.assert_equal(kmer_distance(first, second, 5), np.nan)
Example #14
0
    def _annotate_fp(self,
                     fp,
                     aligner='blastp',
                     evalue=0.001,
                     cpus=1,
                     outfmt='sam',
                     params=None):
        '''Annotate the sequences in the file.

        The query file is searched against each database in turn (the cache
        database first, when a non-empty cache exists). After each search
        the sequences that have no hit so far are written to a temporary
        FASTA file, which becomes the query for the next database. The loop
        stops early once no sequence is left.

        Parameters
        ----------
        fp : str
            Path to the FASTA file of query sequences.
        aligner, evalue, cpus, params
            Passed through to ``self.run_blast``.
        outfmt : str
            Passed to ``self.run_view`` as ``--outfmt``. 'tab' and 'sam'
            results are parsed and filtered; any other value leaves the
            result table empty.

        Returns
        -------
        pandas.DataFrame
            Accumulated, filtered hits over all databases searched.
        '''

        if self.has_cache() and not self.cache.is_empty():
            self.cache.build()
            dbs = [self.cache.db] + self.dat
        else:
            dbs = self.dat

        seqs = []
        found = set()
        res = pd.DataFrame()
        logger = getLogger(__name__)
        for db in dbs:
            out_prefix = splitext(basename(db))[0]
            daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
            out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
            self.run_blast(fp,
                           daa_fp,
                           db,
                           aligner=aligner,
                           evalue=evalue,
                           cpus=cpus,
                           params=params)
            self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0
            if outfmt == 'tab':
                hits = self._filter_best(self.parse_tabular(out_fp))
                res = pd.concat([res, hits])
            elif outfmt == 'sam':
                hits = self._filter_id_cov(self.parse_sam(out_fp))
                res = pd.concat([res, hits])

            # save to a tmp file the seqs that do not hit current database
            new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
            found |= set(res.index)
            with open(new_fp, 'w') as f:
                for seq in read(fp, format='fasta'):
                    if seq.metadata['id'] not in found:
                        seq.write(f, format='fasta')
            logger.info('Number of diamond hits: %d', len(res.index))

            # no seq left
            if stat(new_fp).st_size == 0:
                break
            fp = new_fp
        # collect the subject sequences of the hits so they can be cached
        if outfmt == 'sam' and self.has_cache():
            for x in res.index:
                seqs.append(
                    Sequence(res.loc[x, 'sseq'],
                             metadata={'id': res.loc[x, 'sseqid']}))

        # Update cache (inplace)
        if self.has_cache():
            self.cache.update(seqs)
            self.cache.close()
        return res
Example #15
0
 def test_empty_sequences(self):
     """Distance between two empty sequences is NaN."""
     first, second = Sequence(''), Sequence('')
     npt.assert_equal(kmer_distance(first, second, 3), np.nan)
Example #16
0
 def test_one_empty_sequence(self):
     """Distance between an empty and a non-empty sequence is 1.0."""
     empty = Sequence('')
     populated = Sequence('CGGGCAGCTCCTACCTGCTA')
     self.assertAlmostEqual(kmer_distance(empty, populated, 3), 1.0)
Example #17
0
 def test_return_type(self):
     """kmer_distance returns a float, 0.0 for identical sequences."""
     distance = kmer_distance(Sequence('ATCG'), Sequence('ATCG'), 3)
     self.assertIsInstance(distance, float)
     self.assertEqual(distance, 0.0)
Example #18
0
 def test_differing_length_seqs(self):
     """Sequences of different lengths can be compared."""
     longer = Sequence('AGAAATCTGAGCAAGGATCA')
     shorter = Sequence('TTAGTGCGTAATCCG')
     expected = 0.9285714285714286
     self.assertAlmostEqual(kmer_distance(longer, shorter, 3), expected)
Example #19
0
 def test_same_sequence(self):
     """Two sequences with identical content have distance 0.0."""
     left = Sequence('CTGCGACAGTTGGTA')
     right = Sequence('CTGCGACAGTTGGTA')
     self.assertEqual(kmer_distance(left, right, 3), 0.0)
Example #20
0
 def test_entirely_different_sequences(self):
     """These two sequences are expected to be at the maximum distance 1.0."""
     left = Sequence('CCGTGGTCGTATAAG')
     right = Sequence('CGCCTTCCACATCAG')
     self.assertEqual(kmer_distance(left, right, 3), 1.0)
Example #21
0
 def test_k_less_than_one_error(self):
     """k = 0 is rejected with a ValueError."""
     first, second = Sequence('ATCG'), Sequence('ACTG')
     pattern = r'k must be greater than 0.'
     with self.assertRaisesRegex(ValueError, pattern):
         kmer_distance(first, second, 0)
Example #22
0
 def blank_seq_gen():
     """Yield three sequences, the second of which is empty."""
     sequences = (DNA('A'), Sequence(''), RNA('GG'))
     for item in sequences:
         yield item
Example #23
0
 def test_type_mismatch_error(self):
     """Comparing a Sequence with a DNA object raises TypeError."""
     generic = Sequence('ABC')
     dna = DNA('ATC')
     pattern = r"Type 'Sequence'.*type 'DNA'"
     with self.assertRaisesRegex(TypeError, pattern):
         kmer_distance(generic, dna, 3)
Example #24
0
def calculateHammingDistance(seq1, seq2):
    """Return the hamming distance between two equal length sequences.

    Both arguments are wrapped in ``Sequence`` before being handed to
    ``hamming``.
    """
    return hamming(Sequence(seq1), Sequence(seq2))
Example #25
0
 def test_non_sequence_error(self):
     """Passing a plain str as the second argument raises TypeError."""
     sequence = Sequence('ATCG')
     plain_text = 'ATCG'
     pattern = r"not 'str'"
     with self.assertRaisesRegex(TypeError, pattern):
         kmer_distance(sequence, plain_text, 3)
Example #26
0
 def blank_seq_gen():
     """Yield three sequences, the second of which is empty."""
     yield from (DNA('A'), Sequence(''), RNA('GG'))
Example #27
0
    def test_length_mismatch(self):
        """hamming() rejects sequences of unequal length (3 vs 4)."""
        shorter, longer = Sequence('ABC'), Sequence('ABCD')
        pattern = r'equal length.*3 != 4'

        with self.assertRaisesRegex(ValueError, pattern):
            hamming(shorter, longer)
Example #28
0
 def gen():
     """Yield one Sequence per entry of ``components`` (defined outside).

     Each entry is indexed as: 0 = id, 1 = description, 2 = sequence data,
     3 = quality values.
     """
     for entry in components:
         header = {'id': entry[0], 'description': entry[1]}
         scores = {'quality': entry[3]}
         yield Sequence(entry[2], metadata=header,
                        positional_metadata=scores)
Example #29
0
 def test_default_kwargs(self):
     """kmer_distance with only the required arguments (k = 3)."""
     left = Sequence('AACCTAGCAATGGAT')
     right = Sequence('CAGGCAGTTCTCACC')
     expected = 0.9130434782608695
     self.assertAlmostEqual(kmer_distance(left, right, 3), expected)
Example #30
0
    def setUp(self):
        """Build the GFF3 fixtures: data paths, three IntervalMetadata
        objects, and an annotated Sequence with its DNA counterpart."""
        self.multi_fp = get_data_path('gff3_multi_record')
        self.single_fp = get_data_path('gff3_single_record')

        # metadata of the individual features used below
        chromosome = {
            'source': 'European Nucleotide Archive',
            'type': 'chromosome',
            'score': '.',
            'strand': '.',
            'ID': 'chromosome:Chromosome',
            'Alias': 'U00096.3',
            'Is_circular': 'true'
        }
        promoter = {
            'source': 'regulondb_feature',
            'type': 'biological_region',
            'score': '.',
            'strand': '+',
            'external_name': 'Promoter thrLp (RegulonDB:ECK120010236)',
            'logic_name': 'regulondb_promoter'
        }
        gene = {
            'source': 'Prodigal_v2.60',
            'type': 'gene',
            'score': '1.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_1',
            'gc_cont': '0.427'
        }
        cds = {
            'source': 'Prodigal_v2.60',
            'type': 'CDS',
            'score': '333.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_2',
            'Parent': '1_1',
            'rbs_motif': 'GGAG/GAGG',
            'rbs_spacer': '5-10bp'
        }
        split_gene = {
            'source': 'Prodigal_v2.60',
            'type': 'gene',
            'score': '1.8',
            'strand': '+',
            'phase': 0,
            'ID': '1_1',
            'gene': 'FXR receptor'
        }

        # bounded interval metadata: chromosome plus promoter
        self.upper_bound = 4641652
        self.imd1 = IntervalMetadata(self.upper_bound)
        self.imd1.add(bounds=[(0, 4641652)], metadata=chromosome)
        self.imd1.add(bounds=[(147, 148)], metadata=promoter)

        # unbounded interval metadata: gene and CDS on the same bounds
        self.imd2 = IntervalMetadata(None)
        self.imd2.add(bounds=[(336, 2799)], metadata=gene)
        self.imd2.add(bounds=[(336, 2799)], metadata=cds)

        # unbounded interval metadata: one gene spanning two sub-regions
        self.imd3 = IntervalMetadata(None)
        self.imd3.add(bounds=[(0, 50), (55, 100)], metadata=split_gene)

        self.seq_fp = get_data_path('gff3_dna')
        seq_header = {'id': 'NC_1', 'description': 'species X'}
        self.seq = Sequence('ATGCATGCATGC', metadata=seq_header)
        gene_annotation = {
            'source': 'Prodigal_v2.60',
            'type': 'gene',
            'score': '.',
            'strand': '+',
            'phase': 0,
            'ID': 'gene1',
            'Name': 'FXR'
        }
        self.seq.interval_metadata.add([(0, 9)], metadata=gene_annotation)
        self.dna = DNA(self.seq)
Example #31
0
    def test_invalid_type(self):
        """local_pairwise_align_ssw raises TypeError for a Sequence or a
        str argument."""
        sequence_pattern = r"not type 'Sequence'"
        with self.assertRaisesRegex(TypeError, sequence_pattern):
            local_pairwise_align_ssw(DNA('ACGT'), Sequence('ACGT'))

        str_pattern = r"not type 'str'"
        with self.assertRaisesRegex(TypeError, str_pattern):
            local_pairwise_align_ssw('ACGU', RNA('ACGU'))
Example #32
0
def simulate_novel_acq(genes_donor, seq_donor, genes_recip, seq_recip,
                       orthologous_rep_prob, percentage_hgts, log_f):
    """Insert randomly chosen donor genes into the recipient genome.

    Parameters
    ----------
    genes_donor: dictionary
        Donor genes keyed by protein ID; each value is indexed here as
        [protein sequence, start, stop, strand]
    seq_donor: skbio.sequence.Sequence
        Sequence object for donor genome
    genes_recip: dictionary
        Recipient genes keyed by protein ID; each value must unpack into
        (sequence, start, stop, strand). It is updated in place with one
        "<donor ID>_hgt_n" entry per inserted gene
    seq_recip: skbio.sequence.Sequence
        Sequence object for recipient genome
    orthologous_rep_prob: float
        Probability that an HGT is an orthologous replacement; only the
        remaining (1 - orthologous_rep_prob) share is simulated here
    percentage_hgts: float
        Multiplier applied to the number of recipient genes to size the
        number of HGTs to simulate
    log_f: file descriptor
        Log file descriptor; receives a header line followed by one
        tab-separated line per inserted gene

    Returns
    -------
    skbio.sequence.Sequence
        New recipient genome sequence with the HGTs inserted, carrying the
        metadata of the input recipient sequence

    Notes
    -----
    Algorithm:
        1. choose random slots in the sorted recipient gene positioning
           array, and as many random donor genes
        2. starting right after the gene at the chosen slot, walk forward
           until an open region (one that does not reach the next gene) is
           large enough for the donor gene, so that genes do not overlap
           and compositional methods can pick out individual coding genes
        3. insert the donor nucleotides there, add the new gene to
           `genes_recip` and write it to the log

    The positioning array is built once from the input genes and is not
    updated as genes are inserted. A donor gene for which no following open
    region is large enough is skipped without being logged.
    """
    # number of novel acquisitions to simulate, always at least one
    num_hgts = max(1, int(percentage_hgts * (1 - orthologous_rep_prob) *
                          len(genes_recip)))
    # recipient gene positioning array in ascending order; the two sentinel
    # entries at the genome ends allow insertions before the first and
    # after the last existing gene
    genome_len = len(seq_recip)
    positions = sorted(
        [(0, 0), (genome_len, genome_len)] +
        [(start, stop) for _, start, stop, _ in genes_recip.values()])
    # random slots to insert after, then random donor genes to insert
    # (the order of the two draws is kept for seeded reproducibility)
    slots = random.sample(range(len(positions) - 1), num_hgts)
    donor_labels = random.sample(list(genes_donor), num_hgts)
    log_f.write("#type\tdonor\tstart\tend\trecipient\tstart\tend\tstrand\n")
    genome = str(seq_recip)
    for slot, label in zip(slots, donor_labels):
        donor_gene = genes_donor[label]
        # codon = sequence of 3 nucleotides (hence * 3)
        nucl_len = len(donor_gene[0]) * 3
        # first candidate position is right after the gene at the slot
        insert_at = positions[slot][1] + 1
        for next_start, next_stop in positions[slot + 1:]:
            insert_end = insert_at + nucl_len
            if insert_end < next_start:
                # record the new gene (protein) in the recipient genes
                hgt_gene = "%s_hgt_n" % label
                genes_recip[hgt_gene] = [
                    donor_gene[0], insert_at, insert_end, donor_gene[3]]
                # splice the donor nucleotides into the recipient genome
                genome = "".join((
                    genome[:insert_at],
                    str(seq_donor[donor_gene[1]:donor_gene[2]]),
                    genome[insert_at:]))
                # write HGT to log file
                log_f.write(
                    "n\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                    (label, donor_gene[1], donor_gene[2], hgt_gene,
                     insert_at, insert_end, donor_gene[3]))
                break
            # does not fit: try the open region after the next gene
            insert_at = next_stop + 1
    return Sequence(genome, metadata=seq_recip.metadata)
Example #33
0
 def test_global_pairwise_align_invalid_type(self):
     """Check that ``global_pairwise_align`` rejects a plain ``Sequence``.

     The ``TypeError`` message must mention ``GrammaredSequence``,
     ``TabularMSA`` and the offending ``'Sequence'`` type, in that order.
     """
     expected_msg = "GrammaredSequence.*TabularMSA.*'Sequence'"
     with self.assertRaisesRegex(TypeError, expected_msg):
         global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
Example #34
0
def hamming_distance(s1, s2):
    """Return the distance between `s1` and `s2` as ``Sequence`` objects.

    Both inputs are wrapped in ``Sequence`` and compared with
    ``Sequence.distance`` using its default metric.
    NOTE(review): presumably that default metric is Hamming, as the
    function name suggests -- confirm against the skbio version in use.
    """
    return Sequence(s1).distance(Sequence(s2))
Example #35
0
 def test_local_pairwise_align_invalid_type(self):
     """Check that ``local_pairwise_align`` rejects a plain ``Sequence``.

     The ``TypeError`` message must mention ``GrammaredSequence`` followed
     by ``Sequence``.
     """
     expected_msg = 'GrammaredSequence.*Sequence'
     with self.assertRaisesRegex(TypeError, expected_msg):
         local_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
Example #36
0
    def test_single_character_sequences(self):
        """Check ``hamming`` on one-character sequences.

        A sequence compared with itself gives 0.0; two sequences that
        differ in their only character give 1.0.
        """
        base = Sequence('a')
        other = Sequence('b')

        # identical single characters
        same_dist = hamming(base, base)
        self.assertEqual(same_dist, 0.0)

        # differing single characters
        diff_dist = hamming(base, other)
        self.assertEqual(diff_dist, 1.0)
Example #37
0
 def test_overlap_false(self):
     """Check ``kmer_distance`` with ``overlap=False`` on a fixed pair.

     The third positional argument is 3 (presumably the k-mer length --
     confirm against ``kmer_distance``); the result is compared against a
     precomputed value with ``assertAlmostEqual``.
     """
     first = Sequence('CGTTATGTCTGTGAT')
     second = Sequence('CTGAATCGGTAGTGT')
     expected = 0.8888888888888888
     observed = kmer_distance(first, second, 3, overlap=False)
     self.assertAlmostEqual(observed, expected)