def unique_contiguous_region_filter(sequence):
        '''It flags the snvs located in repeated or discontiguous regions.

        For every snv of the sequence that still has no 'uniq_contiguous'
        result for the current distance, the fragment that spans distance
        positions to each side of the snv is run through blast_runner.
        The stored result is:
            - False when no alignment passes the match filters,
            - True when the first alignment has more than one match,
            - True when there is only one match but introns are inferred
              for the fragment, False if no intron is inferred.

        distance, match_filters, genomic_db, genomic_seqs_index and the
        helper functions are not defined in this block, they are expected
        to be provided by the surrounding scope.

        It returns the given sequence, or None if the sequence is None.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            # we make a blast with the sequence around the snv, the start
            # is clamped to 0 to avoid a negative index wrapping around
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']

            # the try/finally guarantees that the blast output is closed
            # even if the parsing or the intron inference fails
            try:
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                # are there any similar sequences? The next builtin is used
                # instead of the .next() method because the method is not
                # available in python 3 iterators
                alignment = next(alignments, None)
                if alignment is None:
                    # if there is no similar sequence we assume that is
                    # unique
                    result = False
                elif len(alignment['matches']) > 1:
                    # how many matches, it should be only one
                    result = True
                else:
                    # how many match parts have the first match?
                    # we could do it with the blast result, but blast is not
                    # very good aligning, so we realign with est2genome
                    blast_fhand.seek(0)
                    sim_seqs = similar_sequences_for_blast(blast_fhand)
                    sim_seq = sim_seqs[0] if sim_seqs else None

                    introns = infer_introns_for_cdna(sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                    result = bool(introns)
            finally:
                blast_fhand.close()

            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence
 def annotate_intron(sequence):
     '''It adds the inferred introns to the sequence as SeqFeatures.

     Every position returned by infer_introns_for_cdna is appended to
     sequence.features as a feature of type 'intron' whose location starts
     and ends at that position. A KeyError raised by the inference is
     raised again as a RuntimeError.
     It returns the sequence, or None if the sequence is None.
     '''
     if sequence is None:
         return

     try:
         intron_positions = infer_introns_for_cdna(
                                      sequence=sequence,
                                      genomic_seqs_index=genomic_seqs_index,
                                      genomic_db=genomic_db)
     except KeyError as key_error:
         # clean the repr-like text of the KeyError: the leading u and the
         # surrounding quotes
         msg = str(key_error).lstrip('u').strip("'")
         if 'not found' in msg:
             names = (genomic_seqs_fhand.name, genomic_db)
             msg += ' in seq file %s, but present in blast db %s' % names
         raise RuntimeError(msg)

     for position in intron_positions:
         # an intron is stored as a zero length location
         intron_location = FeatureLocation(position, position)
         intron_feature = SeqFeature(location=intron_location,
                                     type='intron',
                                     qualifiers={'genomic_db':genomic_db})
         sequence.features.append(intron_feature)
     return sequence
Example #3
0
    def test_infer_introns_est2genome_method():
        'It checks the introns inferred with the est2genome method'
        # the cdna used as query, split in chunks to keep the lines short
        cdna_chunks = (
            'GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC',
            'AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG',
            'GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA',
            'GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC',
            'TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG',
            'TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG',
            'CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA',
            'AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG',
            'TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT',
            'GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG',
            'TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT',
        )
        seq1 = SeqWithQuality(seq=Seq(''.join(cdna_chunks)))

        # the same path is given as the genomic db and is indexed as fasta
        genomic_db = os.path.join(TEST_DATA_DIR, 'blast', 'tomato_genome2+')
        genomic_seqs_index = SeqIO.index(genomic_db, 'fasta')

        expected_introns = [478, 572, 613]
        introns = infer_introns_for_cdna(seq1, genomic_db,
                                         genomic_seqs_index=genomic_seqs_index)
        assert introns == expected_introns