Example #1
0
 def test_gaps_at_both_ends(self):
     """Non-local alignment of two overlapping substrings.

     Expects 6 matched columns in an alignment of length 10.
     """
     full = 'aaaccggttt'
     # "A" drops the last two bases, "B" drops the first two
     first = DNA.makeSequence(full[:-2], Name="A")
     second = DNA.makeSequence(full[2:], Name="B")
     for aln in self._aligned_both_ways(first, second, local=False):
         self.assertEqual(matchedColumns(aln), 6)
         self.assertEqual(len(aln), 10)
Example #2
0
 def test_assemble_seq(self):
     """should correctly fill in a sequence with N's"""
     frags = ["AAAAA", "CCCCC", "GGG"]
     # each case: (expected sequence, start, end, fragment positions)
     cases = [
         ("NAAAAANNCCCCCNNGGGNNN", 10, 31, [(11, 16), (18, 23), (25, 28)]),
         ("NAAAAANNCCCCCNNGGGNNN", 0, 21, [(1, 6), (8, 13), (15, 18)]),
         # start matches first frag start
         ("AAAAANNCCCCCNNGGGNNN", 0, 20, [(0, 5), (7, 12), (14, 17)]),
         # end matches last frag_end
         ("NAAAAANNCCCCCNNGGG", 10, 28, [(11, 16), (18, 23), (25, 28)]),
         # both start and end matched
         ("AAAAANNCCCCCNNGGG", 10, 27, [(10, 15), (17, 22), (24, 27)]),
     ]
     for raw_expect, start, end, positions in cases:
         expect = DNA.makeSequence(raw_expect)
         got = _assemble_seq(frags, start, end, positions)
         self.assertEqual(got, expect)
     # one frag
     whole = ''.join(frags)
     expect = DNA.makeSequence(whole)
     self.assertEqual(_assemble_seq([whole], 10, 23, [(10, 23)]), expect)
Example #3
0
 def test_assemble_seq(self):
     """should correctly fill in a sequence with N's"""
     frags = ["AAAAA", "CCCCC", "GGG"]

     def check(raw_expect, pieces, start, end, positions):
         # build the expected sequence, then compare with the assembly
         expect = DNA.makeSequence(raw_expect)
         self.assertEqual(_assemble_seq(pieces, start, end, positions),
                          expect)

     check("NAAAAANNCCCCCNNGGGNNN", frags, 10, 31,
           [(11, 16), (18, 23), (25, 28)])
     check("NAAAAANNCCCCCNNGGGNNN", frags, 0, 21,
           [(1, 6), (8, 13), (15, 18)])
     # should work with:
     # start matches first frag start
     check("AAAAANNCCCCCNNGGGNNN", frags, 0, 20,
           [(0, 5), (7, 12), (14, 17)])
     # end matches last frag_end
     check("NAAAAANNCCCCCNNGGG", frags, 10, 28,
           [(11, 16), (18, 23), (25, 28)])
     # both start and end matched
     check("AAAAANNCCCCCNNGGG", frags, 10, 27,
           [(10, 15), (17, 22), (24, 27)])
     # one frag
     whole = ''.join(frags)
     check(whole, [whole], 10, 23, [(10, 23)])
Example #4
0
 def test_gaps_at_both_ends(self):
     """Aligning 'aaaccggt' with 'accggttt' non-locally should give
     6 matched columns and an alignment length of 10."""
     bases = 'aaaccggttt'
     head = DNA.makeSequence(bases[:-2], Name="A")
     tail = DNA.makeSequence(bases[2:], Name="B")
     alignments = self._aligned_both_ways(head, tail, local=False)
     for alignment in alignments:
         self.assertEqual(matchedColumns(alignment), 6)
         self.assertEqual(len(alignment), 10)
Example #5
0
 def test_local_tiebreak(self):
     """Should pick the first best-equal hit rather than the last one"""
     # so that the Pyrex and Python versions give the same result.
     scores = make_dna_scoring_dict(match=1, transition=-1,
                                    transversion=-1)
     query = DNA.makeSequence('cwc', Name='pattern')
     # the target contains both 'cac' and 'ctc'; the first is expected back
     subject = DNA.makeSequence('cactc', Name='target')
     result = local_pairwise(query, subject, scores, 5, 2)
     matched = str(result.NamedSeqs['target']).lower()
     self.assertEqual(matched, 'cac')
Example #6
0
 def setUp(self):
     """Create two gapped sequences with their maps, cigars and raw strings."""
     first_name, second_name = "FAKE01", "FAKE02"
     self.cigar_text = '3D2M3D6MDM2D3MD'
     self.aln_seq = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
     self.aln_seq1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
     # parseOutGaps yields a (map, ungapped sequence) pair for each sequence
     self.map, self.seq = self.aln_seq.parseOutGaps()
     self.map1, self.seq1 = self.aln_seq1.parseOutGaps()
     self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
     aligned = {first_name: self.aln_seq, second_name: self.aln_seq1}
     self.aln = LoadSeqs(data=aligned)
     self.cigars = {first_name: self.cigar_text,
                    second_name: map_to_cigar(self.map1)}
     self.seqs = {first_name: str(self.seq), second_name: str(self.seq1)}
Example #7
0
 def setUp(self):
     """Prepare the fixture: gapped sequences, gap maps, cigar strings
     and the ungapped sequence strings, keyed by FAKE01 / FAKE02."""
     self.cigar_text = '3D2M3D6MDM2D3MD'
     gapped = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
     gapped1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
     self.aln_seq = gapped
     self.aln_seq1 = gapped1
     self.map, self.seq = gapped.parseOutGaps()
     self.map1, self.seq1 = gapped1.parseOutGaps()
     self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
     self.aln = LoadSeqs(data={"FAKE01": gapped, "FAKE02": gapped1})
     # the first cigar is given literally, the second is derived from map1
     second_cigar = map_to_cigar(self.map1)
     self.cigars = {"FAKE01": self.cigar_text, "FAKE02": second_cigar}
     self.seqs = {"FAKE01": str(self.seq), "FAKE02": str(self.seq1)}
Example #8
0
 def test_local_tiebreak(self):
     """Should pick the first best-equal hit rather than the last one"""
     # so that the Pyrex and Python versions give the same result.
     scoring = dict(match=1, transition=-1, transversion=-1)
     score_matrix = make_dna_scoring_dict(**scoring)
     pattern = DNA.makeSequence('cwc', Name='pattern')
     target = DNA.makeSequence('cactc', Name='target')
     alignment = local_pairwise(pattern, target, score_matrix, 5, 2)
     # 'cac' is the leftmost of the two candidate hits in 'cactc'
     aligned_target = alignment.NamedSeqs['target']
     self.assertEqual(str(aligned_target).lower(), 'cac')
Example #9
0
 def test_picklability(self):
     """Pickle an alignment containing an annotated sequence"""
     # This depends on alignments, sequences, features, maps and spans
     # Doesn't test round trip result is correct, which should possibly
     # be done for maps/spans, but seqs/alignments are just simple
     # python classes without __getstate__ etc.
     import cPickle as pickle
     plain = DNA.makeSequence("aagaagaagaccccca")
     annotated = DNA.makeSequence("aagaagaagaccccct")
     annotated.addFeature('exon', 'fred', [(10, 15)])
     aln = LoadSeqs(data={'a': plain, 'b': annotated})
     # passes as long as dumping and reloading do not raise
     dumped = pickle.dumps(aln)
     aln2 = pickle.loads(dumped)
def test(r=1, **kw):
    """Time one non-local pairwise alignment and return its throughput.

    Arguments:
        r - number of times each 10-base motif is repeated
        kw - extra keyword arguments passed to classic_align_pairwise

    Returns len(seq1) * len(seq2) divided by the elapsed seconds.
    Raises ZeroDivisionError if the timer reports no elapsed time.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence("AAAATGCTTA" * r)
    seq1 = DNA.makeSequence("AATTTTGCTG" * r)

    t0 = time.time()
    # only the elapsed time matters here; the alignment itself is discarded
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
Example #11
0
def test(r=1, **kw):
    """Time one non-local pairwise alignment and return its throughput.

    Arguments:
        r - number of times each 10-base motif is repeated
        kw - extra keyword arguments passed to classic_align_pairwise

    Returns len(seq1) * len(seq2) divided by the elapsed seconds.
    Raises ZeroDivisionError if the timer reports no elapsed time.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    t0 = time.time()
    # only the elapsed time matters here; the alignment itself is discarded
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
Example #12
0
 def _make_utr_seq(self):
     """Cache the concatenated 5' and 3' UTR exon sequences.

     Results are stored in self._cached under "Utr5" and "Utr3". A side
     whose untranslated exons are None is cached as self.NULL_VALUE;
     otherwise the Seq of every exon is appended, in order, to an empty
     DNA sequence.
     """
     utr_exons = (("Utr5", self.UntranslatedExons5),
                  ("Utr3", self.UntranslatedExons3))
     for key, exons in utr_exons:
         if exons is None:
             # handled per side: a single missing side used to be iterated
             # and raised TypeError
             self._cached[key] = self.NULL_VALUE
             continue
         utr_seq = DNA.makeSequence("")
         for exon in exons:
             utr_seq += exon.Seq
         self._cached[key] = utr_seq
 def test_picklability(self):
     """Pickle an alignment containing an annotated sequence"""
     # This depends on alignments, sequences, features, maps and spans
     # Doesn't test round trip result is correct, which should possibly
     # be done for maps/spans, but seqs/alignments are just simple
     # python classes without __getstate__ etc.
     import pickle
     unannotated = DNA.makeSequence("aagaagaagaccccca")
     with_exon = DNA.makeSequence("aagaagaagaccccct")
     with_exon.addFeature('exon', 'fred', [(10, 15)])
     aln = LoadSeqs(data=dict(a=unannotated, b=with_exon))
     # passes as long as dumping and reloading do not raise
     serialised = pickle.dumps(aln)
     aln2 = pickle.loads(serialised)
Example #14
0
 def test_codon(self):
     """Codon-model Viterbi alignment of a 9-base and a 6-base sequence
     should have 6 matched columns and length 9."""
     longer = DNA.makeSequence('tacgccgta', Name="A")
     shorter = DNA.makeSequence('tacgta', Name="B")
     settings = dict(model_gaps=False, equal_motif_probs=True,
                     mprob_model='conditional')
     codon_model = cogent.evolve.substitution_model.Codon(**settings)
     tree = cogent.LoadTree(tip_names=['A', 'B'])
     lf = codon_model.makeLikelihoodFunction(tree, aligned=False)
     lf.setSequences({'A': longer, 'B': shorter})
     viterbi_path = lf.getLogLikelihood().edge.getViterbiPath()
     aln = viterbi_path.getAlignment()
     self.assertEqual(matchedColumns(aln), 6)
     self.assertEqual(len(aln), 9)
Example #15
0
 def test_codon(self):
     """Codon-model Viterbi alignment of a 9-base and a 6-base sequence
     should have 6 matched columns and length 9."""
     longer = DNA.makeSequence('tacgccgta', Name="A")
     shorter = DNA.makeSequence('tacgta', Name="B")
     settings = dict(model_gaps=False,
                     equal_motif_probs=True,
                     mprob_model='conditional')
     codon_model = cogent.evolve.substitution_model.Codon(**settings)
     tree = cogent.LoadTree(tip_names=['A', 'B'])
     lf = codon_model.makeLikelihoodFunction(tree, aligned=False)
     lf.setSequences({'A': longer, 'B': shorter})
     edge = lf.getLogLikelihood().edge
     # the score is returned alongside the alignment but not checked here
     score, aln = edge.getViterbiScoreAndAlignment()
     self.assertEqual(matchedColumns(aln), 6)
     self.assertEqual(len(aln), 9)
def test(r=1, **kw):
    """Time one non-local pairwise alignment.

    Returns len(seq1) * len(seq2) per elapsed second, in thousands and
    truncated to an int, or '*' if the aligner raises ArithmeticError.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    started = time.clock()
    try:
        # return_alignment is False in order to emphasise the quadratic part of the work.
        aln = classic_align_pairwise(
            seq1, seq2, S, 10, 2,
            local=False, return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    elapsed = time.clock() - started
    cells = len(seq1) * len(seq2)
    return int(cells / elapsed / 1000)
Example #17
0
    def test_inherit_feature(self):
        """should be able to subclass and extend _Feature"""
        class NewFeat(_Feature):
            def __init__(self, *args, **kwargs):
                super(NewFeat, self).__init__(*args, **kwargs)

            def newMethod(self):
                if len(self.map.spans) <= 1:
                    return True
                # should create new instance of NewFeat
                merged = self.asOneSpan()
                return merged.newMethod()

        seq = DNA.makeSequence('ACGTACGTACGT')
        two_span_map = as_map([(1, 3), (5, 7)], len(seq))
        first = seq.addAnnotation(NewFeat, two_span_map,
                                  type='gene', Name='abcd')
        self.assertEqual(type(first.asOneSpan()), NewFeat)
        self.assertEqual(type(first.getShadow()), NewFeat)
        one_span_map = as_map([(3, 5)], len(seq))
        second = seq.addAnnotation(NewFeat, one_span_map,
                                   type='gene', Name='def')

        covering = seq.getRegionCoveringAll([first, second],
                                            feature_class=NewFeat)
        self.assertEqual(type(covering), NewFeat)
        # now use the new method
        first.newMethod()
Example #18
0
def CigarParser(seqs, cigars, sliced = False, ref_seqname = None, start = None, end = None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start to end
    
    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
        cigars and seqs should have the same seqnames
        sliced - if True, only the region start to end of the reference
            sequence is returned
        ref_seqname - name of the reference sequence, used when sliced
        start, end - bounds passed to slice_cigar for the reference
            sequence, used when sliced
        moltype - optional default to DNA
    """
    data = {}
    if not sliced:
        for seqname in seqs.keys():
            data[seqname] = aligned_from_cigar(cigars[seqname],
                                            seqs[seqname], moltype=moltype)
    else:
        ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                        seqs[ref_seqname], moltype=moltype)
        m, aln_loc = slice_cigar(cigars[ref_seqname], start, end, by_align = False)
        data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
        for seqname in seqs.keys():
            if seqname == ref_seqname:
                continue
            m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.makeSequence(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
            else:
                # nothing of this sequence falls in the slice: fill with gaps,
                # using the requested moltype like every other sequence
                data[seqname] = moltype.makeSequence('-'*(aln_loc[1] - aln_loc[0]))
    aln = LoadSeqs(data = data, aligned = True)
    return aln
def findBestSeq(seqobject):
    """Pick the frame whose translation runs longest before its first stop.

    Arguments:
        seqobject - object providing .seq (converted with str) and .id

    Only the first three frames returned by standard_code.sixframes are
    compared (presumably the forward frames, since the frame index is used
    as the start offset into the sequence -- confirm against sixframes).

    Returns (longestseq, correctdnaseq): the longest translation up to the
    first '*' and the DNA sequence sliced from that frame's offset. When
    every frame yields an empty translation, returns '' and the sequence
    sliced from offset 0.
    """
    dna_seq = str(seqobject.seq)
    my_seq = DNA.makeSequence(dna_seq,seqobject.id)

    all_six = standard_code.sixframes(my_seq)
    # keep only the part of each translation preceding the first stop
    seqlist = [frame.split('*')[0] for frame in all_six]

    longestseq = ''
    # default to frame 0 so the return value is defined even when no frame
    # gives a non-empty translation
    correctdnaseq = my_seq[0:]
    for x in range(3):
        # strict comparison: ties keep the earliest frame
        if len(longestseq) < len(seqlist[x]):
            longestseq = seqlist[x]
            correctdnaseq = my_seq[x:]
    return longestseq, correctdnaseq
Example #20
0
 def _get_flanking_seq_data(self):
     """Cache the (up, down) flanking sequences under 'FlankingSeq'.

     The flanking_sequence record is looked up by variation_id; a side
     without a stored sequence is taken from the genomic sequence instead.
     """
     variation_id = self._table_rows['variation_feature']['variation_id']
     table = self.flanking_sequence_table
     query = sql.select([table], table.c.variation_id == variation_id)
     record = asserted_one(query.execute())
     self._table_rows['flanking_sequence'] = record
     up_seq = record['up_seq']
     down_seq = record['down_seq']
     # the following two checks are because -- wait for it -- someone has
     # entered the string 'NULL' instead of NULL in the MySQL tables!!!
     if up_seq == 'NULL':
         up_seq = None
     if down_seq == 'NULL':
         down_seq = None
     seqs = dict(up=up_seq, down=down_seq)
     for name, seq in list(seqs.items()):
         if seq is None:
             is_down = name == 'down'
             # the window is mirrored for features on the minus strand
             if self.Location.Strand == -1:
                 resized = (-301, -1) if is_down else (1, 301)
             else:
                 resized = (1, 301) if is_down else (-301, -1)
             flank = self.Location.resized(*resized)
             seq = self.genome.getRegion(region=flank).Seq
         else:
             seq = DNA.makeSequence(seq)
         seqs[name] = seq

     # keep the 300 positions nearest the variant on each side
     self._cached['FlankingSeq'] = (seqs['up'][-300:], seqs['down'][:300])
Example #21
0
def makeSampleSequence():
    """Return a DNA sequence annotated with one exon and two repeat units."""
    raw = 'tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgttaaacaagtgagatcgcaggcgcgccaaggc'
    seq = DNA.makeSequence(raw)
    # (feature type, feature name, span)
    features = [
        ('exon', 'exon', (20, 35)),
        ('repeat_unit', 'repeat_unit', (39, 49)),
        ('repeat_unit', 'rep2', (49, 60)),
    ]
    for kind, name, span in features:
        seq.addAnnotation(annotation.Feature, kind, name, [span])
    return seq
Example #22
0
 def test_inherit_feature(self):
     """should be able to subclass and extend _Feature"""
     class NewFeat(_Feature):
         def __init__(self, *args, **kwargs):
             super(NewFeat, self).__init__(*args, **kwargs)

         def newMethod(self):
             if len(self.map.spans) <= 1:
                 return True
             # should create new instance of NewFeat
             merged = self.asOneSpan()
             return merged.newMethod()

     seq = DNA.makeSequence('ACGTACGTACGT')
     two_span_map = as_map([(1, 3), (5, 7)], len(seq))
     first = seq.addAnnotation(NewFeat, two_span_map,
                               type='gene', Name='abcd')
     self.assertEqual(type(first.asOneSpan()), NewFeat)
     self.assertEqual(type(first.getShadow()), NewFeat)
     one_span_map = as_map([(3, 5)], len(seq))
     second = seq.addAnnotation(NewFeat, one_span_map,
                                type='gene', Name='def')

     covering = seq.getRegionCoveringAll([first, second],
                                         feature_class=NewFeat)
     self.assertEqual(type(covering), NewFeat)
     # now use the new method
     first.newMethod()
    def _generate_unambiguous_sequences(self):
        """Fill self.conserved_sequences from self._CONSERVED_SEQUENCES.

        Every conserved sequence is passed through self._disambiguate, which
        may return one sequence or a list of them; each result is stored
        keyed by its string form. The reverse complement of every stored
        sequence is then added with rc=True.
        """
        for pos, seq in self._CONSERVED_SEQUENCES.items():
            expanded = self._disambiguate(DNA.makeSequence(seq))
            if not isinstance(expanded, list):
                expanded = [expanded]
            for dnaseq in expanded:
                self.conserved_sequences[str(dnaseq)] = ConservedSequence(dnaseq, pos)

        # iterate over a snapshot: entries are added inside the loop
        for seq, con_seq in list(self.conserved_sequences.items()):
            # rc() returns the reverse complement as a new sequence, so its
            # result has to be kept; calling it for effect left the forward
            # sequence in place and overwrote the forward entry
            rc_seq = DNA.makeSequence(seq).rc()
            self.conserved_sequences[str(rc_seq)] = ConservedSequence(
                rc_seq, con_seq.pos, rc=True)
Example #24
0
 def test_stop_indexes(self):
     """should return stop codon indexes for a specified frame"""
     code = GeneticCode(self.SGC)
     seq = DNA.makeSequence('ATGCTAACATAAA')
     # expected stop indices when reading from offsets 0, 1 and 2
     expected_by_frame = {0: [9], 1: [4], 2: []}
     for frame in range(3):
         got = code.getStopIndices(seq, start=frame)
         self.assertEqual(got, expected_by_frame[frame])
Example #25
0
def makeSampleSequence():
    """Return 'aaaccggttt' repeated ten times as an annotated DNA sequence.

    One exon and two repeat_unit features are attached.
    """
    seq = DNA.makeSequence('aaaccggttt' * 10)
    # (feature type, feature name, span)
    features = (
        ('exon', 'exon', (20, 35)),
        ('repeat_unit', 'repeat_unit', (39, 49)),
        ('repeat_unit', 'rep2', (49, 60)),
    )
    for kind, name, span in features:
        seq.addAnnotation(annotation.Feature, kind, name, [span])
    return seq
Example #26
0
 def test_stop_indexes(self):
     """should return stop codon indexes for a specified frame"""
     genetic_code = GeneticCode(self.SGC)
     seq = DNA.makeSequence("ATGCTAACATAAA")
     # one expected list per reading offset, starting at offset 0
     expected = ([9], [4], [])
     frame = 0
     for expect in expected:
         self.assertEqual(genetic_code.getStopIndices(seq, start=frame),
                          expect)
         frame += 1
Example #27
0
def makeSampleSequence():
    """Return a DNA sequence annotated with one exon and two repeat units."""
    seq = DNA.makeSequence(
        'tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgttaaacaagtgagatcgcaggcgcgccaaggc'
    )
    # (feature type, feature name, span)
    features = (
        ('exon', 'exon', (20, 35)),
        ('repeat_unit', 'repeat_unit', (39, 49)),
        ('repeat_unit', 'rep2', (49, 60)),
    )
    for kind, name, span in features:
        seq.addAnnotation(annotation.Feature, kind, name, [span])
    return seq
Example #28
0
def adjust_alignment(template,candidate,new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment
    
        This step adjusts the alignment to reduce the length back to the 
         template alignment length by introducing local misalignments to
         remove gap characters that are present in the pairwise alignment
         but not in the template alignment.

        template, candidate: the aligned sequences
        new_gaps: list of template positions to delete; it is not modified

        Returns the adjusted (template, candidate) as DNA sequences.
    
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # positions are visited in reverse order (assumes new_gaps is ascending,
    # so deleting one position does not shift those still to come);
    # reversed() leaves the caller's list untouched
    for pos in reversed(new_gaps):
        del template_l[pos]
        del candidate_l[nearest_gap(candidate_l,pos)]
        
    return (DNA.makeSequence(''.join(template_l)),
            DNA.makeSequence(''.join(candidate_l)))
Example #29
0
def adjust_alignment(template, candidate, new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment
    
        This step adjusts the alignment to reduce the length back to the 
         template alignment length by introducing local misalignments to
         remove gap characters that are present in the pairwise alignment
         but not in the template alignment.

        template, candidate: the aligned sequences
        new_gaps: list of template positions to delete; it is not modified

        Returns the adjusted (template, candidate) as DNA sequences.
    
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # positions are visited in reverse order (assumes new_gaps is ascending,
    # so deleting one position does not shift those still to come);
    # reversed() leaves the caller's list untouched
    for pos in reversed(new_gaps):
        del template_l[pos]
        del candidate_l[nearest_gap(candidate_l, pos)]

    return (DNA.makeSequence(''.join(template_l)),
            DNA.makeSequence(''.join(candidate_l)))
Example #30
0
def makeSampleSequence(with_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature.

    If with_gaps is true, positions 5-9 and the last two positions of the
    raw sequence are replaced by gap characters.
    """
    raw_seq = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if with_gaps:
        raw_seq = ''.join([raw_seq[:5], '-' * 5, raw_seq[10:-2], '-' * 2])
    seq = DNA.makeSequence(raw_seq)
    # (feature type, feature name, span)
    features = (('CDS', 'CDS', (15, 25)), ("5'UTR", "5' UTR", (12, 15)))
    for kind, name, span in features:
        seq.addAnnotation(Feature, kind, name, [span])
    return seq
Example #31
0
def makeSampleSequence(with_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature.

    If with_gaps is true, positions 5-9 and the last two positions of the
    raw sequence are replaced by gap characters.
    """
    bases = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if with_gaps:
        # same length as before: 5 internal gaps and 2 trailing gaps
        bases = '%s-----%s--' % (bases[:5], bases[10:-2])
    seq = DNA.makeSequence(bases)
    cds_span = (15, 25)
    utr_span = (12, 15)
    seq.addAnnotation(Feature, 'CDS', 'CDS', [cds_span])
    seq.addAnnotation(Feature, "5'UTR", "5' UTR", [utr_span])
    return seq
Example #32
0
def introduce_terminal_gaps(template,aligned_template,aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq

        Returns a DNA sequence named like aligned_candidate, padded at each
        end with the terminal gaps that template has beyond those already
        present in aligned_template.
    """

    def leading_gaps(chars):
        # number of consecutive '-' characters at the start of chars
        count = 0
        for char in chars:
            if char != '-':
                break
            count += 1
        return count

    # 5' end: gaps in the original aligned template, minus those already
    # existing in the pairwise aligned template (no need to add these)
    original_five_prime = leading_gaps(template)
    existing_five_prime = leading_gaps(aligned_template)
    five_prime_padding = original_five_prime - existing_five_prime

    # 3' end: the same computation on the reversed sequences
    original_three_prime = leading_gaps(reversed(template))
    existing_three_prime = leading_gaps(reversed(aligned_template))
    three_prime_padding = original_three_prime - existing_three_prime

    # return the sequence with the 5' and 3' gaps added
    padded = '-' * five_prime_padding + str(aligned_candidate) + \
        '-' * three_prime_padding
    return DNA.makeSequence(padded, Name=aligned_candidate.Name)
Example #33
0
def introduce_terminal_gaps(template, aligned_template, aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq

        Returns a DNA sequence named like aligned_candidate, padded at each
        end with the terminal gaps that template has beyond those already
        present in aligned_template.
    """

    def gap_run(chars):
        # length of the run of '-' characters that chars starts with
        run = 0
        for char in chars:
            if char == '-':
                run += 1
            else:
                break
        return run

    # gaps to add = gaps in the original aligned template minus the gaps the
    # pairwise aligned template already carries, computed per end
    five_prime_gaps_to_add = gap_run(template) - gap_run(aligned_template)
    three_prime_gaps_to_add = (gap_run(reversed(template)) -
                               gap_run(reversed(aligned_template)))

    # return the sequence with the 5' and 3' gaps added
    pieces = ['-' * five_prime_gaps_to_add,
              str(aligned_candidate),
              '-' * three_prime_gaps_to_add]
    return DNA.makeSequence(''.join(pieces), Name=aligned_candidate.Name)
Example #34
0
    def test_maps_on_maps(self):
        """structure() of a sequence with a nested feature, before and
        after slicing off the first five positions."""
        seq = DNA.makeSequence("ATCGATCGAT" * 5, Name="base")
        outer = annotate(seq, 10, 20, "fake")
        # annotated for their effect on seq; the return values are not needed
        annotate(outer, 3, 5, "fake2")
        annotate(seq, 1, 3, "left")

        sliced = seq[5:]
        nested = ("fake2", "[3:5]/10")
        expect_full = ("seq", 50, [("fake", "[10:20]/50", [nested]),
                                   ("left", "[1:3]/50")])
        self.assertEqual(structure(seq), expect_full)
        expect_sliced = ("seq", 45, [("fake", "[5:15]/45", [nested])])
        self.assertEqual(structure(sliced), expect_sliced)
def test(r=1, **kw):
    """Time one non-local pairwise alignment.

    Returns len(seq1) * len(seq2) per elapsed second, in thousands and
    truncated to an int, or '*' if the aligner raises ArithmeticError.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    started = time.clock()
    try:
        # return_alignment is False in order to emphasise the quadratic part of the work.
        aln = classic_align_pairwise(
            seq1, seq2, S, 10, 2,
            local=False, return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    elapsed = time.clock() - started
    cells = len(seq1) * len(seq2)
    return int(cells / elapsed / 1000)
Example #36
0
 def _get_sequence(self):
     """Return the sequence for self.Location, cached under 'Seq'.

     Falls back to the assembly exception coordinate and, if that also has
     no sequence, to a run of N's of length len(self).
     """
     if 'Seq' in self._cached:
         return self._cached['Seq']
     try:
         seq = get_sequence(self.Location)
     except NoItemError:
         try:
             alternate = assembly_exception_coordinate(self.Location)
             seq = get_sequence(alternate)
         except NoItemError:
             seq = DNA.makeSequence("N" * len(self))
     # the sequence is always named after the requested location
     seq.Name = str(self.Location)
     self._cached['Seq'] = seq
     return self._cached['Seq']
Example #37
0
 def _make_aln(self, orig, model=dna_model, param_vals=None,
         indel_rate=0.1, indel_length=0.5, **kw):
     """Align the sequences in orig with TreeAlign and return the alignment.

     Arguments:
         orig - {name: raw sequence}; two sequences use the tree
             (A:.1,B:.1), any other number the four-tip tree over A-D
         model, param_vals - passed through to TreeAlign
         indel_rate, indel_length - passed to TreeAlign as keyword arguments
         kw - further keyword arguments for TreeAlign
     """
     kw['indel_rate'] = indel_rate
     kw['indel_length'] = indel_length
     seqs = dict((key, DNA.makeSequence(value))
             for (key, value) in orig.items())
     # a tree built from the tip names used to be created here and was
     # immediately replaced by the one below, so only the latter is kept
     if len(seqs) == 2:
         tree = cogent.LoadTree(treestring="(A:.1,B:.1)")
     else:
         tree = cogent.LoadTree(treestring="(((A:.1,B:.1):.1,C:.1):.1,D:.1)")
     aln, tree = cogent.align.progressive.TreeAlign(model, seqs,
             tree=tree, param_vals=param_vals, show_progress=False, **kw)
     return aln
Example #38
0
    def test_maps_on_maps(self):
        """structure() of a sequence with a nested feature, before and
        after slicing off the first five positions."""
        seq = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
        outer = annotate(seq, 10, 20, 'fake')
        # annotated for their effect on seq; the return values are not needed
        annotate(outer, 3, 5, 'fake2')
        annotate(seq, 1, 3, 'left')

        sliced = seq[5:]
        nested = ('fake2', '[3:5]/10')
        expect_full = ('seq', 50, [('fake', '[10:20]/50', [nested]),
                                   ('left', '[1:3]/50')])
        self.assertEqual(structure(seq), expect_full)
        expect_sliced = ('seq', 45, [('fake', '[5:15]/45', [nested])])
        self.assertEqual(structure(sliced), expect_sliced)
Example #39
0
def makeSampleSequence(mid_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature.

    If mid_gaps is true, positions 5-9 are replaced by gap characters and
    the alternative feature coordinates (10, 20) and (5, 8) are used.
    """
    raw_seq = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if mid_gaps:
        raw_seq = ''.join([raw_seq[:5], '-' * 5, raw_seq[10:]])
        # annotations only make sense when they're on the raw sequence
        cds, utr = (10, 20), (5, 8)
    else:
        cds, utr = (15, 25), (12, 15)
    seq = DNA.makeSequence(raw_seq)
    for kind, name, span in (('CDS', 'CDS', cds), ("5'UTR", "5' UTR", utr)):
        seq.addAnnotation(Feature, kind, name, [span])
    return seq
Example #40
0
def remove_template_terminal_gaps(candidate,template):
    """Remove template terminal gaps and corresponding bases in candidate 

        candidate: aligned candidate sequence; must provide degap()
        template: aligned template sequence of the same length

        Returns (candidate, template), both cut down to the span between the
        template's first and last non-gap positions. The candidate's name
        gets the range 'start..end' appended, separated by ':' if the name
        ends with 'RC' and by a space otherwise. Empty inputs are returned
        unchanged.

        Raises ValueError if the two sequences differ in length.
    """
    if len(template) != len(candidate):
        # call form of raise: valid on Python 2 and Python 3
        raise ValueError(
         "Sequences must be aligned, but their "
         "lengths aren't equal. %d != %d" % (len(candidate),len(template)))
         
    if len(template) == 0:
        return candidate, template
    
    degapped_candidate_len = len(candidate.degap())
    
    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)
    
    template_gap_vector = template.gapVector()
    # NOTE(review): index(False) presumably raises ValueError when the
    # template consists of gaps only -- confirm gapVector's return type
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps
    
    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    if candidate_name.endswith('RC'):
        name_delimiter = ':'
    else:
        name_delimiter = ' '
    candidate_name = '%s%s%d..%d' %\
     (candidate_name,name_delimiter,candidate_start_pos,candidate_end_pos)
    
    return DNA.makeSequence(candidate,Name=candidate_name), template
Example #41
0
def remove_template_terminal_gaps(candidate, template):
    """Remove template terminal gaps and corresponding bases in candidate 

        candidate: aligned candidate sequence; must provide degap()
        template: aligned template sequence of the same length

        Returns (candidate, template), both cut down to the span between the
        template's first and last non-gap positions. The candidate's name
        gets the range 'start..end' appended, separated by ':' if the name
        ends with 'RC' and by a space otherwise. Empty inputs are returned
        unchanged.

        Raises ValueError if the two sequences differ in length.
    """
    if len(template) != len(candidate):
        # call form of raise: valid on Python 2 and Python 3
        raise ValueError(
         "Sequences must be aligned, but their "
         "lengths aren't equal. %d != %d" % (len(candidate),len(template)))

    if len(template) == 0:
        return candidate, template

    degapped_candidate_len = len(candidate.degap())

    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)

    template_gap_vector = template.gapVector()
    # NOTE(review): index(False) presumably raises ValueError when the
    # template consists of gaps only -- confirm gapVector's return type
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps

    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    if candidate_name.endswith('RC'):
        name_delimiter = ':'
    else:
        name_delimiter = ' '
    candidate_name = '%s%s%d..%d' %\
     (candidate_name,name_delimiter,candidate_start_pos,candidate_end_pos)

    return DNA.makeSequence(candidate, Name=candidate_name), template
Example #42
0
def _assemble_seq(frags, start, end, frag_positions):
    """returns a single DNA sequence in which missing sequence is replaced by 'N'

    Arguments:
        frags - the fragment strings, in order
        start, end - coordinates of the region being assembled
        frag_positions - (frag_start, frag_end) for each fragment, in the
            same coordinates and the same order as frags

    Every gap before, between and after the fragments is filled with N's.
    With no fragments the result is end - start N's.
    Raises AssertionError if the number of fragments and positions differ,
    if a fragment starts before the previous one ends, or if end lies
    before the last fragment end.
    """
    prev_end = start
    assert len(frag_positions) == len(frags), "Mismatched number of "\
                                                    "fragments and positions"
    assembled = []
    for index, (frag_start, frag_end) in enumerate(frag_positions):
        diff = frag_start - prev_end
        assert diff >= 0, 'fragment position start < previous end: %s, %s' %\
                                                (frag_start, prev_end)
        assembled += ['N'*diff, frags[index]]
        prev_end = frag_end
    # prev_end is the last fragment end, or start when there are no
    # fragments; using it keeps the empty case from hitting an unbound name
    diff = end - prev_end
    assert diff >= 0, 'end[%s] < previous frag_end[%s]' % (end, prev_end)
    assembled += ['N' * diff]
    return DNA.makeSequence(''.join(assembled))
 def test_simulateAlignment_root_sequence(self):
     """provide a root sequence for simulating an alignment"""
     def use_root_seq(root_sequence):
         aln = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
         tree = LoadTree(treestring="(a,c);")
         model = substitution_model.Dinucleotide(mprob_model='tuple')
         lf = model.makeParamController(tree)
         lf.setAlignment(aln)
         simulated = lf.simulateAlignment(exclude_internal=False,
                                          root_sequence=root_sequence)
         # the simulated root must be the sequence that was supplied
         simulated_root = simulated.NamedSeqs['root']
         self.assertEqual(str(simulated_root), str(root_sequence))

     # first as a sequence instance, then as a string
     as_instance = DNA.makeSequence('GTAATT')
     for root in (as_instance, 'GTAATC'):
         use_root_seq(root)
Example #44
0
def _assemble_seq(frags, start, end, frag_positions):
    """returns a single string in which missing sequence is replaced by 'N'"""
    prev_end = start
    assert len(frag_positions) == len(frags), "Mismatched number of "\
                                                    "fragments and positions"
    assembled = []
    for index, (frag_start, frag_end) in enumerate(frag_positions):
        diff = frag_start - prev_end
        assert diff >= 0, 'fragment position start < previous end: %s, %s' %\
                                                (frag_start, prev_end)
        assembled += ['N' * diff, frags[index]]
        prev_end = frag_end
    diff = end - frag_end
    assert diff >= 0, 'end[%s] < previous frag_end[%s]' % (end, frag_end)
    assembled += ['N' * diff]
    return DNA.makeSequence(''.join(assembled))
Example #45
0
 def test_maps_on_maps(self):
     """structure() of a sequence and of a slice of it, with a nested annotation"""
     base = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
     # 'fake2' is annotated onto the 'fake' feature, not onto the sequence
     outer = annotate(base, 10, 20, 'fake')
     inner = annotate(outer, 3, 5, 'fake2')
     left = annotate(base, 1, 3, 'left')

     sliced = base[5:]

     expected_full = ('seq', 50,
         [('fake', '[10:20]/50',
             [('fake2', '[3:5]/10')]),
         ('left', '[1:3]/50')])
     self.assertEqual(structure(base), expected_full)

     # after dropping the first 5 positions only 'fake' is expected, shifted
     expected_sliced = ('seq', 45,
         [('fake', '[5:15]/45',
             [('fake2', '[3:5]/10')])])
     self.assertEqual(structure(sliced), expected_sliced)
Example #46
0
    def test_simulateAlignment_root_sequence(self):
        """simulated alignment should carry the root sequence it was given"""
        def assert_root_preserved(root_sequence):
            alignment = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
            tree = LoadTree(treestring="(a,c);")
            dinuc = substitution_model.Dinucleotide(mprob_model='tuple')
            lf = dinuc.makeParamController(tree)
            lf.setAlignment(alignment)
            # internal nodes are kept so that 'root' appears in the result
            simulated = lf.simulateAlignment(exclude_internal=False,
                                             root_sequence=root_sequence)
            self.assertEqual(str(simulated.NamedSeqs['root']),
                             str(root_sequence))

        as_instance = DNA.makeSequence('GTAATT')
        assert_root_preserved(as_instance)  # as a sequence instance
        as_string = 'GTAATC'
        assert_root_preserved(as_string)  # as a string
Example #47
0
def CigarParser(seqs,
                cigars,
                sliced=False,
                ref_seqname=None,
                start=None,
                end=None,
                moltype=DNA):
    """return an alignment from raw sequences and cigar strings

    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
            cigars and seqs should have the same seqnames
        sliced - if True, return only the alignment region corresponding to
            start..end of the reference sequence
        ref_seqname - name of the reference sequence, used when sliced
        start, end - slice bounds on the reference sequence; passed to
            slice_cigar with by_align=False
        moltype - optional, defaults to DNA; used for every sequence that
            is constructed here, including all-gap fillers
    """
    data = {}
    if not sliced:
        for seqname in seqs:
            data[seqname] = aligned_from_cigar(cigars[seqname],
                                               seqs[seqname],
                                               moltype=moltype)
    else:
        ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                         seqs[ref_seqname],
                                         moltype=moltype)
        # aln_loc is the (start, end) of the requested region in alignment
        # coordinates; the accompanying map is not needed for the reference
        _, aln_loc = slice_cigar(cigars[ref_seqname],
                                 start,
                                 end,
                                 by_align=False)
        data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
        for seqname in seqs:
            if seqname == ref_seqname:
                continue
            m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.makeSequence(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
            else:
                # nothing of this sequence falls in the slice: fill with gaps,
                # honouring the requested moltype rather than assuming DNA
                data[seqname] = moltype.makeSequence(
                    '-' * (aln_loc[1] - aln_loc[0]))
    aln = LoadSeqs(data=data, aligned=True)
    return aln
Example #48
0
    def test_other_repeat(self):
        """should apply repeat feature data in a manner consistent with strand"""
        coord = dict(CoordName=13, Start=32890200, End=32890500)
        # the same region requested on the plus (1) and minus (-1) strands
        ps_repeat = self.human.getRegion(Strand=1, **coord)
        ms_repeat = self.human.getRegion(Strand=-1, **coord)
        exp = DNA.makeSequence('CTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTG'\
          'TCCAAACCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGA'\
          'TTTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')

        # assertEqual is used throughout: the assertEquals alias is
        # deprecated and no longer exists in recent unittest versions
        self.assertEqual(ms_repeat.Seq, ps_repeat.Seq.rc())

        ps_annot_seq = ps_repeat.getAnnotatedSeq(feature_types='repeat')
        ms_annot_seq = ms_repeat.getAnnotatedSeq(feature_types='repeat')
        ps_seq = ps_annot_seq.getAnnotationsMatching('repeat')[0]
        ms_seq = ms_annot_seq.getAnnotationsMatching('repeat')[0]
        # the first repeat annotation must slice identically from either strand
        self.assertEqual(ms_seq.getSlice(), ps_seq.getSlice())
        self.assertEqual(ps_seq.getSlice(), exp)
Example #49
0
    def test_other_repeat(self):
        """should apply repeat feature data in a manner consistent with strand"""
        coord = dict(CoordName=13, Start=32890200, End=32890500)
        # ps_* / ms_* hold the Strand=1 and Strand=-1 views of the region
        ps_repeat = self.human.getRegion(Strand=1, **coord)
        ms_repeat = self.human.getRegion(Strand=-1, **coord)
        exp = DNA.makeSequence('CTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTG'\
          'TCCAAACCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGA'\
          'TTTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')

        # minus strand sequence is expected to be the reverse complement
        # (assertEqual replaces the deprecated assertEquals alias)
        self.assertEqual(ms_repeat.Seq, ps_repeat.Seq.rc())

        ps_annot_seq = ps_repeat.getAnnotatedSeq(feature_types='repeat')
        ms_annot_seq = ms_repeat.getAnnotatedSeq(feature_types='repeat')
        ps_seq = ps_annot_seq.getAnnotationsMatching('repeat')[0]
        ms_seq = ms_annot_seq.getAnnotationsMatching('repeat')[0]
        self.assertEqual(ms_seq.getSlice(), ps_seq.getSlice())
        self.assertEqual(ps_seq.getSlice(), exp)
Example #50
0
    def test_getByAnnotation(self):
        """getByAnnotation should yield the segments of matching annotations"""
        base = DNA.makeSequence("ATCGATCGAT" * 5, Name="base")
        annotations = (("test_label", (5, 10)), ("test_label2", (15, 18)))
        for label, span in annotations:
            base.addAnnotation(Feature, "test_type", label, [span])

        # selecting by type alone gives both annotated segments
        by_type = list(base.getByAnnotation("test_type"))
        self.assertEqual(len(by_type), 2)
        self.assertEqual(str(by_type[0]), "TCGAT")
        self.assertEqual(str(by_type[1]), "TCG")

        # selecting by type and label gives just the labelled segment
        by_label = list(base.getByAnnotation("test_type", "test_label"))
        self.assertEqual(len(by_label), 1)
        self.assertEqual(str(by_label[0]), "TCGAT")

        # test ignoring of a partial annotation: the slice ends inside the
        # second annotation's (15, 18) span
        truncated = base[:17]
        complete_only = list(
            truncated.getByAnnotation("test_type", ignore_partial=True))
        self.assertEqual(len(complete_only), 1)
        self.assertEqual(str(complete_only[0]), "TCGAT")
Example #51
0
    def test_other_repeat(self):
        """repeat feature data should be applied consistently with strand"""
        # 13:32316063 -32316363
        location = dict(CoordName=13, Start=32316063, End=32316363)
        plus_region = self.human.getRegion(Strand=1, **location)
        minus_region = self.human.getRegion(Strand=-1, **location)
        # note this MER3 repeat is annotated on the -1 strand
        expected = DNA.makeSequence(
            'AGCTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTGTCCAAA'
            'CCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGAT'
            'TTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')

        # the two strands must be reverse complements of one another
        self.assertEqual(minus_region.Seq, plus_region.Seq.rc())

        plus_annotated = plus_region.getAnnotatedSeq(feature_types='repeat')
        minus_annotated = minus_region.getAnnotatedSeq(feature_types='repeat')
        plus_repeat = plus_annotated.getAnnotationsMatching('repeat')[0]
        minus_repeat = minus_annotated.getAnnotationsMatching('repeat')[0]
        # the first repeat annotation slices identically from either strand
        self.assertEqual(minus_repeat.getSlice(), plus_repeat.getSlice())
        self.assertEqual(plus_repeat.getSlice(), expected)
Example #52
0
 def test_getByAnnotation(self):
     """getByAnnotation should return the annotated segments of a sequence"""
     annotated = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
     for label, span in (('test_label', (5,10)), ('test_label2', (15,18))):
         annotated.addAnnotation(Feature, 'test_type', label, [span])

     # both annotations share the type, so both segments come back
     segments = list(annotated.getByAnnotation('test_type'))
     self.assertEqual(len(segments), 2)
     self.assertEqual(str(segments[0]), 'TCGAT')
     self.assertEqual(str(segments[1]), 'TCG')

     # restricting by label leaves a single segment
     segments = list(annotated.getByAnnotation('test_type', 'test_label'))
     self.assertEqual(len(segments), 1)
     self.assertEqual(str(segments[0]), 'TCGAT')

     # test ignoring of a partial annotation
     clipped = annotated[:17]
     segments = list(clipped.getByAnnotation('test_type',
                                             ignore_partial=True))
     self.assertEqual(len(segments), 1)
     self.assertEqual(str(segments[0]), 'TCGAT')
Example #53
0
 def test_getByAnnotation(self):
     """segments selected by annotation type, by label and from a slice"""
     feature_type = 'test_type'
     seq = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
     seq.addAnnotation(Feature, feature_type, 'test_label', [(5,10)])
     seq.addAnnotation(Feature, feature_type, 'test_label2', [(15,18)])

     # type only: one segment per annotation
     found = list(seq.getByAnnotation(feature_type))
     self.assertEqual(len(found), 2)
     self.assertEqual(str(found[0]), 'TCGAT')
     self.assertEqual(str(found[1]), 'TCG')

     # type and label: only the first annotation
     found = list(seq.getByAnnotation(feature_type, 'test_label'))
     self.assertEqual(len(found), 1)
     self.assertEqual(str(found[0]), 'TCGAT')

     # test ignoring of a partial annotation: seq[:17] cuts into (15,18)
     head = seq[:17]
     found = list(head.getByAnnotation(feature_type, ignore_partial=True))
     self.assertEqual(len(found), 1)
     self.assertEqual(str(found[0]), 'TCGAT')
Example #54
0
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Return the sequence for coord, assembled from rows of the 'dna' table.

    coord is converted to the sequence level coordinate system, each
    resulting target location is fetched from the database, and the pieces
    are joined by _assemble_seq over coord.Start..coord.End.

    Side effect: coord.Strand is set to 1.
    Raises NoItemError if the coordinate conversion yields nothing.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # whatever strand was requested, the + strand sequence is fetched first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species,
                             core_db=genome.CoreDb,
                             seq_level=True)

    if DEBUG:
        print('Created Coordinate:', coord, coord.EnsemblStart,
              coord.EnsemblEnd)
        print(coord.CoordType, coord_type)

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)
    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    seqs = []
    positions = []
    for query_loc, target_loc in assemblies:
        assert query_loc.Strand == 1
        num_bases = len(target_loc)
        # get MySQL to do the string slicing via substr function
        sliced_column = substr(dna.c.sequence, target_loc.EnsemblStart,
                               num_bases).label('sequence')
        query = sql.select([sliced_column],
                           dna.c.seq_region_id == target_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        fragment = DNA.makeSequence(record['sequence'])
        # pieces located on the - strand are reverse complemented
        if target_loc.Strand == -1:
            fragment = fragment.rc()
        seqs.append(str(fragment))
        positions.append((query_loc.Start, query_loc.End))
    return _assemble_seq(seqs, coord.Start, coord.End, positions)
Example #55
0
 def _make_aln(self,
               orig,
               model=dna_model,
               param_vals=None,
               indel_rate=0.1,
               indel_length=0.5,
               **kw):
     """Return the TreeAlign alignment of the sequences in orig.

     Arguments:
         orig - {name: raw sequence}; each value is converted with
             DNA.makeSequence
         model - substitution model handed to TreeAlign
         param_vals - passed through to TreeAlign
         indel_rate, indel_length - added to kw and passed to TreeAlign
         kw - any further keyword arguments for TreeAlign

     A fixed two taxon tree (A, B) is used when there are exactly two
     sequences, otherwise a fixed four taxon tree (A, B, C, D).
     """
     kw['indel_rate'] = indel_rate
     kw['indel_length'] = indel_length
     seqs = {key: DNA.makeSequence(value) for (key, value) in orig.items()}
     # only one tree is ever needed; the tip_names based tree that used to be
     # built first was immediately overwritten by the treestring version
     if len(seqs) == 2:
         treestring = "(A:.1,B:.1)"
     else:
         treestring = "(((A:.1,B:.1):.1,C:.1):.1,D:.1)"
     tree = cogent.LoadTree(treestring=treestring)
     # TreeAlign returns (alignment, tree); only the alignment is wanted
     aln, _ = cogent.align.progressive.TreeAlign(model,
                                                 seqs,
                                                 tree=tree,
                                                 show_progress=False,
                                                 param_vals=param_vals,
                                                 **kw)
     return aln
def _has_direction_suffix(name):
    """True if the part of name before the first '_' ends with 'f' or 'r'."""
    return name.split('_')[0].endswith(('f', 'r'))


def get_primers(primers_data=None,
                primer_name=None,
                primer_sequence=None):
    """ Gets primers from filepath or from single specified primer name/seq

     primers_data: If specified, open file object for primers data
     primer_name: single primer name to analyze
     primer_sequence: single primer sequence to analyze

     If only the primers_data is provided, all primers in the file will
     be read and appended to a list as PyCogent DNA.Sequence objects.  If
     a single primer name is provided along with the primers_data, but no
     primer_sequence, then the primers list will be populated by the single
     primer and its sequence in the provided primers_data (if not found an
     error will be raised).  If both a primer_name and primer_sequence are
     provided, only a single primer will be constructed from these, and any
     primers file data passed will be ignored.

     Returns a list of sequences built with DNA.makeSequence.

     Raises ValueError if neither primers data nor a name/sequence pair is
     given, if a primer name lacks the 'f'/'r' suffix, if the requested
     primer name is not in the primers data, or if no primers were read.
     """

    # User must specify a primers filepath, or a primer_name and
    # primer_sequence, error check for this.
    if not primers_data and not (primer_name and primer_sequence):
        raise ValueError("Missing primer(s) data.  User must specify either "
                         "a primers filepath, or a primer name and sequence."
                         "  See the -P, -p, and -s parameters.")

    # Check for correct naming convention of single primer specified
    if primer_name:
        # Fix primer name if not in correct format (followed by lower case f
        # or r).  Leave other components unchanged.
        primer_name = correct_primer_name(primer_name)
        # Check primer name for proper 'r' or 'f' ending
        if not _has_direction_suffix(primer_name):
            raise ValueError(
                'Primer name %s does not '
                'end with "f" or "r".  The initial alphanumeric name of the '
                'primer must be followed by "f" or "r".  Example: '
                '22f_archaeal' % primer_name)

    # If both primer name and seq provided, return single DNA sequence object
    # for that primer
    if primer_name and primer_sequence:
        return [DNA.makeSequence(primer_sequence, Name=primer_name)]

    # Parse out primers data from formatted primers file, returns list
    # of tuples with (primer name, primer seq)
    raw_primers = parse_formatted_primers_data(primers_data)
    # Test all primer names for proper suffix of 'f' or 'r'
    for p in raw_primers:
        if not _has_direction_suffix(p[0]):
            raise ValueError(
                'Primer %s does not end '
                'with "f" or "r".  The initial alphanumeric name of the '
                'primer must be followed by "f" or "r".  Example: '
                '22f_archaeal' % p[0])

    # If primer_name provided, return single DNA.Sequence object with that
    # particular primer name and sequence from the primers file
    if primer_name:
        # Search raw_primers for primer name that matches one provided
        for p in raw_primers:
            if p[0] == primer_name:
                return [DNA.makeSequence(p[1], Name=primer_name)]
        # If primer name not found, raise value error
        raise ValueError(
            'Primer %s not found in input '
            'primers file, please add to primers file or specify sequence '
            'with the -s parameter.' % primer_name)

    # If not using a single primer, build all primers in input primers file
    primers = [DNA.makeSequence(p[1], Name=p[0]) for p in raw_primers]

    # Raise error if nothing built from input file
    if not primers:
        raise ValueError('No primers were read from input primers file, '
                         'please check file format.')

    return primers
Example #57
0
 def test_short(self):
     """two short sequences aligned with local=False, in either order"""
     first = DNA.makeSequence('tacagta', Name="A")
     second = DNA.makeSequence('tacgtc', Name="B")
     alignments = self._aligned_both_ways(first, second, local=False)
     for aln in alignments:
         # 5 matched columns out of an alignment length of 7
         self.assertEqual(matchedColumns(aln), 5)
         self.assertEqual(len(aln), 7)
Example #58
0
def assign_dna_reads_to_protein_database(query_fasta_fp,
                                         database_fasta_fp,
                                         output_fp,
                                         temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively. May hold a
                  'genetic_code' key selecting the entry of GeneticCodes
                  used for translation (defaults to 1). The dict passed in
                  is not modified.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.

    Raises ApplicationError if temp_dir is not absolute or if params tries
    to set '-t' or '-q'.
    """
    # work on a copy so that removing 'genetic_code' below never alters the
    # caller's dict
    params = {} if params is None else dict(params)

    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    # 'genetic_code' is consumed here and must not reach the wrapped call
    my_genetic_code = GeneticCodes[params.pop('genetic_code', 1)]

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    frames = [1, 2, 3, -1, -2, -3]
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')
    try:
        # both handles are closed even if parsing or translation fails
        with tmp_out:
            with open(query_fasta_fp) as query_file:
                for label, sequence in MinimalFastaParser(query_file):
                    seq_id = label.split()[0]

                    s = DNA.makeSequence(sequence)
                    translations = my_genetic_code.sixframes(s)

                    # entries are written in ascending frame order
                    for frame, translation in sorted(zip(frames,
                                                         translations)):
                        entry = '>{seq_id}_frame_{frame}\n{trans}\n'
                        entry = entry.format(seq_id=seq_id,
                                             frame=frame,
                                             trans=translation)
                        tmp_out.write(entry)

        result = assign_reads_to_database(tmp,
                                          database_fasta_fp,
                                          output_fp,
                                          params=my_params)
    finally:
        # the temporary query file is removed on success and on failure
        remove(tmp)

    return result
Example #59
0
def matchedColumns(align):
    """Return the number of alignment columns whose motifs all agree.

    The count is the length of align.filtered() applied with a predicate
    that accepts a column only when every motif equals the first motif
    that is not None.
    """
    def all_same(column):
        reference = None
        for motif in column:
            if reference is not None:
                if motif != reference:
                    return False
            else:
                # nothing to compare against yet, adopt this motif
                reference = motif
        return True

    matched = align.filtered(all_same)
    return len(matched)


# Module-level named DNA sequences; AlignmentTestCase.test_local aligns them
# against each other.
seq1 = DNA.makeSequence('aaaccggacattacgtgcgta', Name='FAKE01')
seq2 = DNA.makeSequence('ccggtcaggttacgtacgtt', Name='FAKE02')


class AlignmentTestCase(unittest.TestCase):
    def _aligned_both_ways(self, seq1, seq2, **kw):
        """Return [align(seq1, seq2), align(seq2, seq1)].

        Both alignments come from classic_align_pairwise using the same
        scoring dict and the same two numeric arguments (10 and 2); kw is
        forwarded unchanged.
        """
        scores = make_dna_scoring_dict(10, -1, -8)
        orderings = ((seq1, seq2), (seq2, seq1))
        return [classic_align_pairwise(first, second, scores, 10, 2, **kw)
                for first, second in orderings]

    def test_local(self):
        for a in self._aligned_both_ways(seq1, seq2, local=True):
            self.assertEqual(matchedColumns(a), 15)
            self.assertEqual(len(a), 19)