def process_insertion_in_exon(human_seq, spec_seq, ordinal, frame):
    '''
    Build the alignment piece that describes an insertion inside an exon.

    The piece is labelled "frameshift" when the inserted species sequence
    is 1, 2, 4 or 5 characters long; any other length is labelled
    "insertion".

    @param human_seq: reference part of the alignment, passed on unchanged
    @param spec_seq:  species part of the alignment; its length decides
                      the label
    @param ordinal:   ordinal passed on to the created piece
    @param frame:     frame stored on the piece via set_frame
    @return: AlignmentExonPiece
    '''
    is_frameshift = len(spec_seq) in (1, 2, 4, 5)
    piece_kind = "frameshift" if is_frameshift else "insertion"

    piece = AlignmentExonPiece(piece_kind, ordinal, human_seq, spec_seq)
    piece.set_frame(frame)
    return piece
def translate_ensembl_exons(ensembl_exons):
    '''
    Translate the coding sequence spanned by a list of exons.

    For every exon the slice sequence[relative_start:relative_stop] is
    taken, the slices are concatenated in list order, the leading
    out-of-frame characters are dropped and translate() is called on
    what remains.

    The number of dropped characters is the frame of the first exon when
    its relative_start is 0; otherwise the frame is shifted by
    relative_start, i.e. (frame - relative_start) modulo 3.

    @param ensembl_exons: non-empty list of objects exposing sequence,
                          relative_start, relative_stop and frame
    @return: whatever translate() of the concatenated sequence returns
    @raise IndexError: if ensembl_exons is empty
    '''
    first_exon = ensembl_exons[0]
    if first_exon.relative_start == 0:
        new_frame = first_exon.frame
    else:
        # Starting relative_start characters into the exon moves the
        # reading frame back by the same amount.
        new_frame = (first_exon.frame - first_exon.relative_start) % 3

    # translate() is called without arguments below, which a plain str
    # does not support, so the exon sequences are presumably sequence
    # objects (e.g. Biopython Seq) -- TODO confirm. The pieces are joined
    # with "+" rather than str.join so that their type is preserved.
    cdna = ""
    for ee in ensembl_exons:
        cdna += ee.sequence[ee.relative_start:ee.relative_stop]

    return cdna[new_frame:].translate()
def process_insertion_free_region(human_seq, spec_seq, frame, ordinal, alignment_start):
    '''
    Process a region of an alignment without any insertions in the
    reference species exon. Deletions (gap characters "-" in the species
    sequence) may still be present.

    Gaps of one or two characters that are flanked by bases are first
    handed to replace_slash (defined elsewhere); according to the original
    documentation they are replaced by N and regarded as an assembly
    error. The remaining sequence is cut into runs of non-gap characters
    ("coding" pieces) and runs of gaps ("deletion" pieces).

    The species sequence may start or end with a gap, or consist of gaps
    only; such runs are reported as "deletion" pieces like any other.

    @param human_seq:       reference sequence, aligned position by
                            position with spec_seq
    @param spec_seq:        species sequence of the region
    @param frame:           frame at the first position of the region
    @param ordinal:         ordinal of the first piece; every further
                            piece gets the next ordinal
    @param alignment_start: alignment coordinate of the first position
    @return: list of AlignmentExonPiece in alignment order. Coding pieces
             carry a frame and alignment locations, deletion pieces carry
             neither.
    '''
    # The regular expressions below need a plain string.
    spec_seq = re.sub("([ATGCN])(-{1,2})([ATGCN])", replace_slash, str(spec_seq))

    # No gap left: the whole region is one coding piece.
    if "-" not in spec_seq:
        al_piece = AlignmentExonPiece("coding", ordinal, human_seq, spec_seq)
        al_piece.set_frame(frame)
        al_piece.set_alignment_locations(alignment_start, alignment_start + len(spec_seq))
        return [al_piece]

    al_exons = []
    offset = 0  # position of the current run inside the region

    # Every character belongs to exactly one run, so the runs tile the
    # region and offsets into spec_seq are also offsets into human_seq.
    for run in re.findall("-+|[^-]+", spec_seq):
        run_end = offset + len(run)
        human = human_seq[offset:run_end]

        if run.startswith("-"):
            # deletion in the species sequence: recorded without frame
            # or alignment locations
            piece = AlignmentExonPiece("deletion", ordinal, human, run)
        else:
            piece = AlignmentExonPiece("coding", ordinal, human, run)
            # Moving offset positions into the region shifts the frame
            # back by the same amount; the first position keeps the
            # frame exactly as it was passed in.
            piece.set_frame(frame if offset == 0 else (frame - offset) % 3)
            piece.set_alignment_locations(alignment_start + offset,
                                          alignment_start + run_end)

        al_exons.append(piece)
        offset = run_end
        ordinal += 1

    return al_exons