Example #1
0
def get_gene_distance(seq_1, seq_2):
    """
    Return the distance between two DNA sequences after local alignment.

    Both inputs are upper-cased; ``seq_1`` becomes the Striped
    Smith-Waterman query and ``seq_2`` the target. The aligned query and
    target strings are wrapped in ``DNA`` and compared with ``distance``.
    """
    ssw_query = StripedSmithWaterman(seq_1.upper())
    hit = ssw_query(seq_2.upper())
    aligned_query = DNA(hit.aligned_query_sequence)
    aligned_target = DNA(hit.aligned_target_sequence)
    return aligned_query.distance(aligned_target)
Example #2
0
    def test_transcribe_preserves_all_metadata(self):
        # Interval metadata handed to both the expected RNA and the input DNA.
        intervals = IntervalMetadata(4)
        intervals.add([(0, 2)], metadata={'gene': 'p53'})

        expected_rna = RNA('AGUU',
                           metadata={'foo': 'bar'},
                           positional_metadata={'foo': range(4)},
                           interval_metadata=intervals)
        dna = DNA('AGTT',
                  metadata={'foo': 'bar'},
                  positional_metadata={'foo': range(4)},
                  interval_metadata=intervals)

        observed = dna.transcribe()
        self.assertEqual(observed, expected_rna)
Example #3
0
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    roi_dict = {'region':roi.position_range}
    range_match = re.search('(\d*)-(\d*)', roi.position_range)
    if not range_match:
        return roi_dict
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        rstart = read.reference_start
        if rstart <= start:
            nt_sequence = DNA(read.query_alignment_sequence[start-rstart:end-rstart])
            if reverse_comp:
                nt_sequence = nt_sequence.reverse_complement()
            #scikit-bio doesn't support translating degenerate bases currently, so we will just throw out reads with degenerates for now
            if nt_sequence.has_degenerates(): 
                continue
            aa_sequence = nt_sequence.translate()
            aa_string = str(aa_sequence).replace('*', 'x')
            if aa_string:
                nt_sequence_counter.update([str(nt_sequence)])
                aa_sequence_counter.update([aa_string])
                depth += 1
    if len(aa_sequence_counter) == 0:
        roi_dict['flag'] = "region not found"
        return roi_dict
    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    num_changes = 0
    reference = roi.aa_sequence
    consensus = aa_consensus
    if roi.nt_sequence:
        reference = roi.nt_sequence
        consensus = nt_consensus
    for i in range(len(reference)):
        if len(consensus) <= i or reference[i] != consensus[i]:
            num_changes += 1
    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
Example #4
0
    def test_distances(self):
        """SequenceCollection.distances returns the expected DistanceMatrix
        """
        ids = ['d1', 'd2']
        collection = SequenceCollection([DNA("ACGT", "d1"),
                                         DNA("ACGG", "d2")])

        # the two sequences differ at one of four positions
        hamming_dm = DistanceMatrix([[0, 0.25], [0.25, 0]], ids)
        self.assertEqual(collection.distances(hamming), hamming_dm)

        # a caller-supplied distance function is used as-is
        def constant_distance(s1, s2):
            return 42.

        constant_dm = DistanceMatrix([[0, 42.], [42., 0]], ids)
        self.assertEqual(collection.distances(constant_distance),
                         constant_dm)
Example #5
0
def reformat_egid(genbank_fp,
                  output_dir):
    """ Reformat input genome to the formats accepted by EGID.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    Input to EGID are five obsolete NCBI standard files: gbk, fna, faa, ffn
    and ptt. They are written to ``output_dir`` as ``id.gbk``, ``id.fna``,
    ``id.faa``, ``id.ffn`` and ``id.ptt``.
    """
    (gb, genes) = _merge_genbank_seqs(genbank_fp)
    DNA.write(gb, join(output_dir, 'id.fna'), format='fasta')
    DNA.write(gb, join(output_dir, 'id.gbk'), format='genbank')
    nucl_seq = str(gb)
    # a ptt file contains the following columns:
    fields = ('Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym',
              'Code', 'COG', 'Product')
    # genes ordered by their start coordinate
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    # "with" closes all three files even when a write fails part-way through
    with open(join(output_dir, 'id.faa'), 'w') as faa, \
            open(join(output_dir, 'id.ffn'), 'w') as ffn, \
            open(join(output_dir, 'id.ptt'), 'w') as ptt:
        ptt.write('locus001\n' + str(len(genes)) + ' proteins\n')
        ptt.write('\t'.join(fields) + '\n')
        # gid: incremental integer assigned to the current gene
        for gid, (gene, info) in enumerate(ordered, start=1):
            translation, start, end, strand = info[:4]
            faa.write('>' + gene + '\n' + translation + '\n')
            ptt.write(str(start) + '..' + str(end) + '\t' + strand + '\t' +
                      str(len(translation)) + '\t' + str(gid) +
                      '\t-\tgene' + str(gid) + '\t-\t-\t-\n')
            region = nucl_seq[start - 1:end]
            if strand == '+':  # positive strand
                ffn.write('>locus001:' + str(start) + '-' + str(end) +
                          '\n' + region + '\n')
            else:  # negative strand (reverse complement)
                rc_seq = str(DNA(region).reverse_complement())
                ffn.write('>locus001:c' + str(end) + '-' + str(start) +
                          '\n' + rc_seq + '\n')
Example #6
0
    def test_embl_to_dna(self):
        # The reader is asked for record ``index + 1`` of the multi-record
        # file and compared with entry ``index`` of the fixture list.
        index = 1
        record = self.multi[index]
        observed = _embl_to_dna(self.multi_fp, seq_num=index + 1)
        expected = DNA(record[0],
                       metadata=record[1],
                       lowercase=True,
                       interval_metadata=record[2])

        self.assertEqual(expected, observed)
Example #7
0
def reformat_genemark(genbank_fp, output_dir):
    """ Write the merged genome in the formats prepared for GeneMark.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    GeneMark's acceptable input file format is FASTA (genome sequence).
    The merged sequence is written as ``id.fna`` (FASTA) and ``id.gbk``
    (GenBank) inside ``output_dir``.
    """
    merged = _merge_genbank_seqs(genbank_fp)[0]
    for file_name, file_format in (('id.fna', 'fasta'),
                                   ('id.gbk', 'genbank')):
        DNA.write(merged, join(output_dir, file_name), format=file_format)
Example #8
0
 def test_reverse_transcribe_preserves_all_metadata(self):
     # Metadata and positional metadata must survive RNA -> DNA conversion.
     rna = RNA('AGUU', metadata={'foo': 'bar'},
               positional_metadata={'foo': range(4)})
     expected_dna = DNA('AGTT', metadata={'foo': 'bar'},
                        positional_metadata={'foo': range(4)})
     observed = rna.reverse_transcribe()
     self.assertEqual(observed, expected_dna)
Example #9
0
def generateReference(assay_list):
    """Yield one ``DNA`` reference sequence per amplicon of every assay.

    For an assay with ``AND`` operands, every operand that is a ``Target``
    contributes its amplicons; otherwise the amplicons of ``assay.target``
    are used. Each sequence carries its name under the ``'id'`` metadata key.

    Parameters
    ----------
    assay_list : iterable
        Assays exposing ``name``, ``AND`` and (when ``AND`` is empty)
        ``target.amplicons``.

    Yields
    ------
    skbio.DNA
        Amplicon sequence with ``metadata={'id': <name>}``.
    """
    from skbio import DNA
    for assay in assay_list:
        name = assay.name
        if assay.AND:
            for operand in assay.AND:
                if not isinstance(operand, Target):
                    continue
                # NOTE(review): the name keeps growing across operands and
                # amplicons of the same assay (gene and variant suffixes
                # accumulate) - kept as in the original, confirm intent.
                if operand.gene_name:
                    name = name + "_%s" % operand.gene_name
                for amplicon in operand.amplicons:
                    if amplicon.variant_name:
                        name = name + "_%s" % amplicon.variant_name
                    # Pass the id via metadata, matching the other branch;
                    # the metadata-based constructor has no ``id`` keyword.
                    yield DNA(amplicon.sequence, metadata={'id': name})
        else:
            for amplicon in assay.target.amplicons:
                # Always start from the assay name so an amplicon without a
                # variant name does not inherit the previous amplicon's name.
                if amplicon.variant_name:
                    name = assay.name + "_%s" % amplicon.variant_name
                else:
                    name = assay.name
                yield DNA(amplicon.sequence, metadata={'id': name})
Example #10
0
    def test_translate_ncbi_table_id(self):
        # (positional arguments to translate, expected protein);
        # an empty tuple exercises the default genetic code.
        cases = [((), 'KFMH'), ((9,), 'NFMH')]
        for seq in RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT'):
            for table_args, expected in cases:
                obs = seq.translate(*table_args)
                self.assertEqual(obs, Protein(expected))
Example #11
0
def _find_approx_reverse(args):
    """
    Find an approximate match for a reverse primer.

    ``args`` is a two-item ``[sequence, primer]`` pair. The primer is wrapped
    in ``DNA`` and locally aligned against the sequence; element ``[2][0][0]``
    of the ``_local_aln`` result is returned (presumably the start of the
    match in ``sequence`` - confirm against ``_local_aln``).
    """
    sequence, primer = args
    alignment = _local_aln(sequence, DNA(primer))
    first_coords = alignment[2][0]
    return first_coords[0]
Example #12
0
def _find_approx_forward(args):
    """
    Find an approximate match for a forward primer.

    ``args`` is a two-item ``[sequence, primer]`` pair. The primer is wrapped
    in ``DNA`` and locally aligned against the sequence; element ``[2][0][1]``
    of the ``_local_aln`` result plus one is returned (presumably the first
    position after the match in ``sequence`` - confirm against
    ``_local_aln``).
    """
    sequence, primer = args
    alignment = _local_aln(sequence, DNA(primer))
    first_coords = alignment[2][0]
    return first_coords[1] + 1
Example #13
0
def gene_distance(A,B):
    '''Compute the sequence distance between two genes A and B.

    Positions where both sequences hold a gap character are skipped, every
    degenerate character is replaced by a randomly chosen member of its
    ``degenerate_map`` entry, and the two resulting sequences are compared
    with ``DNA.distance``.
    '''
    kept_a, kept_b = [], []  # characters kept after removing common gaps
    for char_a, char_b in izip(A.values, B.values):
        if char_a in A.gap_chars and char_b in B.gap_chars:
            continue
        if char_a in A.degenerate_chars:
            char_a = random_choice(list(A.degenerate_map[char_a]))
        kept_a.append(char_a)
        if char_b in B.degenerate_chars:
            char_b = random_choice(list(B.degenerate_map[char_b]))
        kept_b.append(char_b)
    newA = DNA(''.join(kept_a), metadata={})
    newB = DNA(''.join(kept_b), metadata={})
    return newA.distance(newB)
Example #14
0
def reformat_genemark(genbank_fp,
                      output_dir):
    """ Export the merged genome as the files prepared for GeneMark.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    GeneMark's acceptable input file format is FASTA (genome sequence).
    Both a FASTA (``id.fna``) and a GenBank (``id.gbk``) copy of the merged
    sequence are written to ``output_dir``.
    """
    merged_genome = _merge_genbank_seqs(genbank_fp)[0]
    fasta_path = join(output_dir, 'id.fna')
    DNA.write(merged_genome, fasta_path, format='fasta')
    genbank_path = join(output_dir, 'id.gbk')
    DNA.write(merged_genome, genbank_path, format='genbank')
Example #15
0
    def test_constructor_non_empty_no_keys(self):
        # Each case: input sequences and the (rows, positions) shape expected.
        cases = [
            ([DNA('ACG')], (1, 3)),                    # 1x3
            ([DNA('A'), DNA('C'), DNA('G')], (3, 1)),  # 3x1
        ]
        for seqs, expected_shape in cases:
            msa = TabularMSA(seqs)
            self.assertIs(msa.dtype, DNA)
            self.assertEqual(msa.shape, expected_shape)
            # no keys were supplied, so reading them must fail
            with self.assertRaises(OperationError):
                msa.keys
            self.assertEqual(list(msa), seqs)
Example #16
0
    def test_constructor_empty_no_keys(self):
        # no sequences at all
        no_seqs = TabularMSA([])
        self.assertIsNone(no_seqs.dtype)
        self.assertEqual(no_seqs.shape, (0, 0))
        with self.assertRaises(OperationError):
            no_seqs.keys
        with self.assertRaises(StopIteration):
            next(iter(no_seqs))

        # sequences present but with zero positions
        empty_seqs = [DNA(''), DNA('')]
        no_positions = TabularMSA(empty_seqs)
        self.assertIs(no_positions.dtype, DNA)
        self.assertEqual(no_positions.shape, (2, 0))
        with self.assertRaises(OperationError):
            no_positions.keys
        self.assertEqual(list(no_positions), empty_seqs)
Example #17
0
    def setUp(self):
        # Build a sequence whose only metadata is its id.
        def with_id(constructor, residues, seq_id):
            return constructor(residues, metadata={'id': seq_id})

        self.d1 = with_id(DNA, 'GATTACA', "d1")
        self.d2 = with_id(DNA, 'TTG', "d2")
        self.d3 = with_id(DNA, 'GTATACA', "d3")
        self.r1 = with_id(RNA, 'GAUUACA', "r1")
        self.r2 = with_id(RNA, 'UUG', "r2")
        self.r3 = with_id(RNA, 'U-----UGCC--', "r3")

        # DNA only, RNA only, both mixed, and two equal-length DNA sequences
        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])
Example #18
0
 def test_genbank_to_dna(self):
     # The reader is asked for record ``index + 1`` of the multi-record file
     # and compared with entry ``index`` of the fixture list.
     index = 1
     record = self.multi[index]
     observed = _genbank_to_dna(self.multi_fp, seq_num=index + 1)
     expected = DNA(record[0], metadata=record[1], lowercase=True,
                    positional_metadata=record[2])
     self.assertEqual(expected, observed)
Example #19
0
 def setUp(self):
     # (label, bases): the label is used as both series index and sequence id
     labelled = [('A', 'AGTC'), ('B', 'ARWS'), ('C', 'CTWK'),
                 ('D', 'GTCM'), ('E', 'ATGN')]
     self.seq_array = pd.Series(
         data=[DNA(bases, metadata={'id': label})
               for label, bases in labelled],
         index=[label for label, _ in labelled])
     self.seq_array.index = self.seq_array.index.astype(str)

     mers = np.array(['AGTCCATGC', 'TACGAGTGA',
                      'ACTCCATGC', 'AAAAAAAGT'])
     self.in_mer = pd.Series(data=mers)

     reads = np.array(['AGTC', 'WGWN', 'AGTT'])
     self.reads2 = pd.Series(data=reads,
                             index=['r2.0', 'r2.1', 'r2.2'])
Example #20
0
def reformat_egid(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by EGID.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    Input to EGID are five obsolete NCBI standard files: gbk, fna, faa, ffn
    and ptt. They are written to ``output_dir`` as ``id.gbk``, ``id.fna``,
    ``id.faa``, ``id.ffn`` and ``id.ptt``.
    """
    (gb, genes) = _merge_genbank_seqs(genbank_fp)
    DNA.write(gb, join(output_dir, 'id.fna'), format='fasta')
    DNA.write(gb, join(output_dir, 'id.gbk'), format='genbank')
    nucl_seq = str(gb)
    # a ptt file contains the following columns:
    fields = ('Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym', 'Code',
              'COG', 'Product')
    # genes ordered by their start coordinate
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    # The context managers guarantee the three files are closed even if a
    # write raises, which the manual open()/close() pairing did not.
    with open(join(output_dir, 'id.faa'), 'w') as faa, \
            open(join(output_dir, 'id.ffn'), 'w') as ffn, \
            open(join(output_dir, 'id.ptt'), 'w') as ptt:
        ptt.write('locus001\n' + str(len(genes)) + ' proteins\n')
        ptt.write('\t'.join(fields) + '\n')
        # gid: incremental integer assigned to the current gene
        for gid, (gene, info) in enumerate(ordered, start=1):
            translation, start, end, strand = info[:4]
            faa.write('>' + gene + '\n' + translation + '\n')
            ptt.write(
                str(start) + '..' + str(end) + '\t' + strand + '\t' +
                str(len(translation)) + '\t' + str(gid) + '\t-\tgene' +
                str(gid) + '\t-\t-\t-\n')
            region = nucl_seq[start - 1:end]
            if strand == '+':  # positive strand
                ffn.write('>locus001:' + str(start) + '-' + str(end) +
                          '\n' + region + '\n')
            else:  # negative strand (reverse complement)
                rc_seq = str(DNA(region).reverse_complement())
                ffn.write('>locus001:c' + str(end) + '-' + str(start) +
                          '\n' + rc_seq + '\n')
Example #21
0
 def test_stockholm_runon_gs(self):
     # Both fixture files must parse to the same single-sequence MSA.
     expected = TabularMSA(
         [DNA('ATCGTTCAGTG', metadata={'LN': 'This is a runon GS line.'})],
         index=['seq1'])
     for fixture in ('stockholm_runon_gs_no_whitespace',
                     'stockholm_runon_gs_with_whitespace'):
         path = get_data_path(fixture)
         parsed = _stockholm_to_tabular_msa(path, constructor=DNA)
         self.assertEqual(parsed, expected)
Example #22
0
 def test_msa_to_stockholm_minimal(self):
     expected_path = get_data_path('stockholm_minimal')
     msa = TabularMSA([DNA('TGTGTCGCAGTTGTCGTTTG')], index=['0235244'])
     # write into an in-memory buffer and keep the produced text
     with io.StringIO() as buffer:
         _tabular_msa_to_stockholm(msa, buffer)
         observed = buffer.getvalue()
     with io.open(expected_path) as handle:
         expected = handle.read()
     self.assertEqual(observed, expected)
Example #23
0
 def test_sort_on_key_with_some_repeats(self):
     def build(rows):
         # rows: (bases, id) pairs turned into DNA objects carrying that id
         return [DNA(bases, metadata={'id': seq_id})
                 for bases, seq_id in rows]

     msa = TabularMSA(build([('TCCG', 10), ('TAGG', 10), ('GGGG', 8),
                             ('ACGT', 0), ('TAGG', 10)]),
                      keys=range(5))
     msa.sort(key='id')

     # Sequences sharing id 10 are expected in their original relative
     # order (keys 0, 1, 4).
     expected = TabularMSA(build([('ACGT', 0), ('GGGG', 8), ('TCCG', 10),
                                  ('TAGG', 10), ('TAGG', 10)]),
                           keys=[3, 2, 0, 1, 4])
     self.assertEqual(msa, expected)
Example #24
0
    def test_translate_six_frames_ncbi_table_id(self):
        # rc = CAAUUU
        # Only the first frame differs between the default table and table 9.
        shared_frames = ['N', 'I', 'QF', 'N', 'I']
        cases = [((), ['KL'] + shared_frames),
                 ((9,), ['NL'] + shared_frames)]
        for seq in RNA('AAAUUG'), DNA('AAATTG'):
            for table_args, frames in cases:
                obs = list(seq.translate_six_frames(*table_args))
                self.assertEqual(obs, [Protein(frame) for frame in frames])
Example #25
0
    def test_omit_gap_sequences(self):
        # thresholds that leave the alignment unchanged
        for threshold in (1.0, 0.20):
            self.assertEqual(self.a2.omit_gap_sequences(threshold), self.a2)

        # a slightly lower threshold leaves only r2
        only_r2 = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

        # an empty alignment stays empty whatever the threshold
        for threshold in (0.0, 0.2, 1.0):
            self.assertEqual(self.empty.omit_gap_sequences(threshold),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        all_gaps = Alignment([
            DNA('.' * 33, metadata={'id': 'abc'}),
            DNA('-' * 33, metadata={'id': 'def'})
        ])
        just_below_one = 1 - np.finfo(float).eps
        self.assertEqual(all_gaps.omit_gap_sequences(just_below_one),
                         Alignment([]))
Example #26
0
 def test_translate_preserves_metadata(self):
     metadata = {'foo': 'bar', 'baz': 42}
     positional_metadata = {'foo': range(3)}
     inputs = (
         RNA('AUG', metadata=metadata,
             positional_metadata=positional_metadata),
         DNA('ATG', metadata=metadata,
             positional_metadata=positional_metadata),
     )
     for nucleotide_seq in inputs:
         protein = nucleotide_seq.translate()
         # metadata retained, positional metadata dropped
         expected = Protein('M', metadata={'foo': 'bar', 'baz': 42})
         self.assertEqual(protein, expected)
Example #27
0
    def test_reverse_transcribe_preserves_all_metadata(self):
        # Interval metadata handed to both the input RNA and the expected DNA.
        intervals = IntervalMetadata(4)
        intervals.add([(0, 2)], metadata={'gene': 'p53'})

        rna = RNA('AGUU',
                  metadata={'foo': 'bar'},
                  positional_metadata={'foo': range(4)},
                  interval_metadata=intervals)
        expected_dna = DNA('AGTT',
                           metadata={'foo': 'bar'},
                           positional_metadata={'foo': range(4)},
                           interval_metadata=intervals)

        observed = rna.reverse_transcribe()
        self.assertEqual(observed, expected_dna)
Example #28
0
def remove_gapped_columns(msa, site_threshold=0.95):
    """Return a new TabularMSA without heavily gapped positions.

    The alignment is converted to a DataFrame via ``msa.to_dict()`` and
    ``gap_dectector`` is applied to every row. A row is kept when its result
    is below ``number_of_columns * site_threshold`` (``gap_dectector``
    presumably counts gap characters in the row - confirm against its
    definition). The kept values of every column are decoded as UTF-8,
    concatenated and wrapped in ``DNA`` with the column label as ``'id'``.
    """
    frame = pd.DataFrame(msa.to_dict())
    row_scores = frame.apply(gap_dectector, axis=1)
    cutoff = len(frame.columns) * site_threshold
    kept = frame[row_scores < cutoff]

    sequences = []
    for seq_id in kept:
        residues = kept[seq_id].str.decode("utf-8").str.cat()
        sequences.append(DNA(residues, metadata={"id": seq_id}))
    return TabularMSA(sequences)
Example #29
0
def generateReference(assay_list):
    """Collect one ``DNA`` reference sequence per amplicon of every assay.

    For an assay with ``AND`` operands, every operand that is a ``Target``
    contributes its amplicons; otherwise the amplicons of ``assay.target``
    are used. Each sequence carries its name under the ``'id'`` metadata key.

    Parameters
    ----------
    assay_list : iterable
        Assays exposing ``name``, ``AND`` and (when ``AND`` is empty)
        ``target.amplicons``.

    Returns
    -------
    skbio.SequenceCollection
        All amplicon sequences, in the order they were visited.
    """
    from skbio import DNA
    from skbio import SequenceCollection
    reference = []
    for assay in assay_list:
        name = assay.name
        if assay.AND:
            for operand in assay.AND:
                if not isinstance(operand, Target):
                    continue
                # NOTE(review): the name keeps growing across operands and
                # amplicons of the same assay (gene and variant suffixes
                # accumulate) - kept as in the original, confirm intent.
                if operand.gene_name:
                    name = name + "_%s" % operand.gene_name
                for amplicon in operand.amplicons:
                    if amplicon.variant_name:
                        name = name + "_%s" % amplicon.variant_name
                    # Pass the id via metadata, matching the other branch;
                    # the metadata-based constructor has no ``id`` keyword.
                    reference.append(
                        DNA(amplicon.sequence, metadata={'id': name}))
        else:
            for amplicon in assay.target.amplicons:
                # Always start from the assay name so an amplicon without a
                # variant name does not inherit the previous amplicon's name.
                if amplicon.variant_name:
                    name = assay.name + "_%s" % amplicon.variant_name
                else:
                    name = assay.name
                reference.append(
                    DNA(amplicon.sequence, metadata={'id': name}))
    return SequenceCollection(reference)
Example #30
0
def dnaLocalAlignSsw(seq1, seq2):
    """Locally align two DNA strings with SSW and summarise the result.

    Both inputs are upper-cased before alignment. The returned dict holds the
    upper-cased first sequence (``'seq1'``), the two aligned sequences
    (``'aln1'``, ``'aln2'``), the alignment ``'score'`` and ``'similarity'``:
    the relative match frequency of the aligned pair times 100, rounded to
    two decimals.
    """
    seq1 = seq1.upper()
    seq2 = seq2.upper()

    msa, score, _positions = local_pairwise_align_ssw(DNA(seq1), DNA(seq2))
    aligned_first, aligned_second = msa[0], msa[1]
    percent_match = aligned_first.match_frequency(aligned_second,
                                                  relative=True) * 100

    return {
        'seq1': str(seq1),
        'aln1': str(aligned_first),
        'aln2': str(aligned_second),
        'score': score,
        'similarity': float('{:.2f}'.format(percent_match)),
    }
Example #31
0
 def test_stockholm_runon_gf(self):
     # Both fixture files must parse to the same MSA-level metadata.
     expected = TabularMSA([DNA('ACTGGTTCAATG')],
                           metadata={'CC': 'CBS domains are small intracellular'
                                           ' modules mostly found in 2 or four '
                                           'copies within a protein.'},
                           index=['GG1344'])
     for fixture in ('stockholm_runon_gf_no_whitespace',
                     'stockholm_runon_gf_with_whitespace'):
         path = get_data_path(fixture)
         parsed = _stockholm_to_tabular_msa(path, constructor=DNA)
         self.assertEqual(parsed, expected)
 def test_translate_six_frames_genetic_code_object(self):
     # A GeneticCode object (64 'M' amino acids, 64 '-' starts) is passed
     # instead of an NCBI table id; every frame is expected to be all 'M'.
     custom_code = GeneticCode('M' * 64, '-' * 64)
     frame_strings = ('MM', 'M', 'M', 'MM', 'M', 'M')
     for seq in RNA('AAAUUG'), DNA('AAATTG'):
         obs = list(seq.translate_six_frames(custom_code))
         self.assertEqual(obs, [Protein(frame) for frame in frame_strings])
Example #33
0
    def test_embl_to_gb(self):
        # EMBL records have more features than genbank (e.g. more than one
        # date, embl class, DOI cross references), so a general embl -> gb ->
        # embl round trip cannot keep all the data; this simple record can.

        # Round trip: embl file -> object -> genbank text -> object ->
        # embl text. The final text must equal the original file.
        from_embl = DNA.read(self.single_rna_simple_fp, format="embl")

        # write the EMBL-derived object as GenBank and parse it back
        with io.StringIO() as genbank_buffer:
            DNA.write(from_embl, format="genbank", file=genbank_buffer)
            genbank_buffer.seek(0)
            from_genbank = DNA.read(genbank_buffer, format="genbank")

        # write the GenBank-derived object as EMBL and keep the text
        with io.StringIO() as embl_buffer:
            DNA.write(from_genbank, format="embl", file=embl_buffer)
            obs = embl_buffer.getvalue()

        # the original file content is the expectation
        with open(self.single_rna_simple_fp) as original:
            exp = original.read()

        self.assertEqual(exp, obs)
Example #34
0
    def test_update_ids_sequence_attributes_propagated(self):
        # --- one sequence: only the id may change ---
        expected_single = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])
        expected_single_map = {'abc': 'seq1'}

        single = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])

        updated, id_map = single.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(updated, expected_single)
        self.assertEqual(id_map, expected_single_map)

        # --- two sequences: description and quality stay with each one ---
        expected_pair = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])
        expected_pair_map = {'abc': 'seq1', 'def': 'seq2'}

        pair = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])

        updated, id_map = pair.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(updated, expected_pair)
        self.assertEqual(id_map, expected_pair_map)
Example #35
0
    def test_embl_to_gb(self):
        # EMBL records carry more information than genbank ones (several
        # dates, embl class, DOI cross references), so not every EMBL record
        # survives embl -> gb -> embl; the simple record used here does.

        # Steps: embl file -> object -> genbank text -> object -> embl text,
        # then check that the first file and the last text are identical.
        embl_record = DNA.read(self.single_rna_simple_fp, format="embl")

        # EMBL-derived object -> GenBank text -> object
        with io.StringIO() as stream:
            DNA.write(embl_record, format="genbank", file=stream)
            stream.seek(0)
            genbank_record = DNA.read(stream, format="genbank")

        # GenBank-derived object -> EMBL text
        with io.StringIO() as stream:
            DNA.write(genbank_record, format="embl", file=stream)
            obs = stream.getvalue()

        # expectation: the untouched source file
        with open(self.single_rna_simple_fp) as source_file:
            exp = source_file.read()

        self.assertEqual(exp, obs)
Example #36
0
    def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
        # One sequence is about 3x the length of the other; toggling
        # penalize_terminal_gaps must change both the alignment and its score.
        short_seq = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
        long_seq = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
        scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                       match_score=5, mismatch_score=-4)

        # terminal gaps not penalized
        obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=False, **scoring)
        expected_free = TabularMSA([
            DNA("-------------------------ACCGTGGACCGTTAGGA"
                "TTGGACCCAAGGTTG-------------------------"),
            DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")])
        self.assertEqual(obs_msa, expected_free)
        self.assertEqual(obs_score, 131.0)

        # terminal gaps penalized
        obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=True, **scoring)
        expected_penalized = TabularMSA([
            DNA("-------------------------ACCGTGGACCGTTAGGA"
                "TTGGACCCAAGGTT-------------------------G"),
            DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")])
        self.assertEqual(obs_msa, expected_penalized)
        self.assertEqual(obs_score, 97.0)
Example #37
0
    def test_motif_pyrimidine_run(self):
        motif = "pyrimidine-run"

        # empty sequence: nothing to find
        empty = DNA("")
        self.assertEqual(list(empty.find_motifs(motif)), [])

        # gaps split the runs when they are not ignored
        gapped = DNA("AARC--TCRA")
        self.assertEqual(list(gapped.find_motifs(motif)),
                         [slice(3, 4), slice(6, 8)])

        # ignoring gap positions, with a minimum run length of 3
        gapped = DNA("AA-RC--TCR-A")
        found = list(gapped.find_motifs(motif, min_length=3,
                                        ignore=gapped.gaps()))
        self.assertEqual(found, [slice(4, 9)])
Example #38
0
    def test_motif_purine_run(self):
        motif = "purine-run"

        # empty sequence: nothing to find
        empty = DNA("")
        self.assertEqual(list(empty.find_motifs(motif)), [])

        # gaps split the runs when they are not ignored
        gapped = DNA("AARC--TCRG")
        self.assertEqual(list(gapped.find_motifs(motif)),
                         [slice(0, 3), slice(8, 10)])

        # ignoring gap positions, with a minimum run length of 3
        gapped = DNA("AA-RC--TCR-G")
        found = list(gapped.find_motifs(motif, min_length=3,
                                        ignore=gapped.gaps()))
        self.assertEqual(found, [slice(0, 4)])
Example #39
0
    def test_gb_to_embl(self):
        # Round trip: genbank file -> object -> embl text -> object ->
        # genbank text, which must equal the original file.
        from_genbank = DNA.read(self.genbank_fp, format="genbank")

        with io.StringIO() as embl_buffer:
            DNA.write(from_genbank, format="embl", file=embl_buffer)

            # EMBL can't deal with genbank version (ie M14399.1  GI:145229)
            # parse the EMBL text back before writing GenBank again
            embl_buffer.seek(0)
            from_embl = DNA.read(embl_buffer, format="embl")

        with io.StringIO() as genbank_buffer:
            DNA.write(from_embl, format="genbank", file=genbank_buffer)
            obs = genbank_buffer.getvalue()

        with open(self.genbank_fp) as original:
            exp = original.read()

        self.assertEqual(exp, obs)
Example #40
0
 def test_transcribe_does_not_modify_input(self):
     dna = DNA('ATAT')
     transcribed = dna.transcribe()
     self.assertEqual(transcribed, RNA('AUAU'))
     # the source sequence must be left untouched by transcription
     self.assertEqual(dna, DNA('ATAT'))
Example #41
0
 def test_transcribe_preserves_all_metadata(self):
     # Metadata and positional metadata must survive DNA -> RNA conversion.
     expected_rna = RNA('AGUU',
                        metadata={'foo': 'bar'},
                        positional_metadata={'foo': range(4)})
     dna = DNA('AGTT',
               metadata={'foo': 'bar'},
               positional_metadata={'foo': range(4)})
     observed = dna.transcribe()
     self.assertEqual(observed, expected_rna)
from skbio import DNA, read
# Translate every input sequence in all six reading frames and write each
# translation that has no stop codon as a FASTA record under the id of the
# sequence it came from.
with open("outfile.fasta", "w") as fasta_out:
    for record in read('test_sequences.fasta', format='fasta'):
        dna = DNA(record)
        stop_free = (frame for frame in dna.translate_six_frames()
                     if not frame.has_stops())
        for frame in stop_free:
            fasta_out.write(
                ">" + str(dna.metadata['id']) + "\n" + str(frame) + "\n")
from skbio import DNA
from skbio.alignment import global_pairwise_align_nucleotide

# Load the two stored sequences and define the query they are compared with.
s1, s2 = (DNA.read(path) for path in ("data/seq1", "data/seq2"))
query = DNA("TTTTCTTGTTGATTCTGGTCCAGAGTAATCGCTTGAGTGTTG")

def pairwise_similarity(seq, query):
    """Globally align ``seq`` with ``query`` and return the value of
    ``fraction_same`` between the two aligned sequences."""
    aligned = global_pairwise_align_nucleotide(seq, query)
    aligned_seq, aligned_query = aligned[0], aligned[1]
    return aligned_seq.fraction_same(aligned_query)

# Report both sequences and their similarity to the query. The parenthesised
# single-argument form of print behaves identically on Python 2 and Python 3.
print("seq1: %s\nseq2: %s" % (s1, s2))
print("seq1-query: %s" % pairwise_similarity(s1, query))
print("seq2-query: %s" % pairwise_similarity(s2, query))
Example #44
0
def _merge_genbank_seqs(genbank_fp):
    """ Merge one to multiple sequences in a GenBank file into one.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format

    Returns
    -------
    tuple of (
        skbio.DNA,
            Genome sequence, genes and metadata
        dict of { list of [ string, int, int, string ] }
            Gene name : translation, start, end, and strand
    )

    Notes
    -----
    Gene coordinates are expressed on the concatenated sequence: ``start`` is
    1-based and ``end`` is inclusive. When a protein id occurs more than once
    only its first occurrence is kept.
    """
    # Every record in the file ends with a line starting with '//'.
    with open(genbank_fp, 'r') as input_f:
        nseq = sum(1 for line in input_f if line.startswith('//'))

    seq_parts = []  # per-record sequences, joined once after the loop
    genes = {}
    abs_pos = 0  # absolute position in concatenated nucleotide sequence
    for i in range(nseq):
        gb = Sequence.read(genbank_fp, seq_num=i+1, format='genbank')
        size = gb.metadata['LOCUS']['size']
        seq_parts.append(str(gb))
        # NOTE(review): relies on the private ``_intervals`` attribute of
        # the interval metadata - confirm it exists in the skbio version used.
        for feature in gb.interval_metadata._intervals:
            m = feature.metadata
            if m['type'] != 'CDS' or 'protein_id' not in m:
                continue
            protein_id = m['protein_id'].replace('\"', '')
            if protein_id in genes:
                continue
            translation = m['translation'].replace(' ', '') \
                .replace('\"', '')
            strand = m['strand']
            # shift the record-local bounds onto the merged sequence
            start = feature.bounds[0][0] + abs_pos + 1
            end = feature.bounds[0][1] + abs_pos
            genes[protein_id] = [translation, start, end, strand]
        abs_pos += int(size)

    nucl_seq = ''.join(seq_parts)
    gb = DNA(nucl_seq)
    # generate mock metadata for the merged sequence
    gb.metadata['LOCUS'] = {'locus_name': 'locus001', 'size': len(nucl_seq),
                            'unit': 'bp', 'shape': 'circular',
                            'division': 'CON', 'mol_type': 'DNA',
                            'date': '01-JAN-1900'}
    gb.metadata['id'] = 'locus001'
    gb.interval_metadata._intervals = []
    # genes ordered by start; gid is an incremental integer per gene
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    for gid, (gene, info) in enumerate(ordered, start=1):
        translation, start, end, strand = info
        # generate "gene" and "CDS" records for each protein-coding gene
        location = str(start) + '..' + str(end)  # start and end coordinates
        if strand == '-':  # negative strand
            location = 'complement(' + location + ')'
        feature = {'type': 'gene', 'locus_tag': 'gene' + str(gid),
                   '__location': location}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
        feature = {'type': 'CDS', 'locus_tag': 'gene' + str(gid),
                   '__location': location, 'protein_id': gene,
                   'translation': translation}
        gb.interval_metadata.add([(start - 1, end)], metadata=feature)
    return (gb, genes)