def process_insertion_in_exon(human_seq, spec_seq, ordinal, frame):
    '''
    Wrap an inserted stretch of the alignment in an AlignmentExonPiece.

    The piece is labelled "frameshift" when the species sequence is
    1, 2, 4 or 5 characters long, and "insertion" for every other length.

    @param human_seq: reference sequence of the stretch, passed through as is
    @param spec_seq: species sequence of the stretch; only its length is inspected
    @param ordinal: ordinal handed to the created piece
    @param frame: frame stored on the created piece
    @return: AlignmentExonPiece
    '''
    is_frameshift = len(spec_seq) in (1, 2, 4, 5)
    piece = AlignmentExonPiece("frameshift" if is_frameshift else "insertion",
                               ordinal, human_seq, spec_seq)
    piece.set_frame(frame)
    return piece

def translate_ensembl_exons (ensembl_exons):
    '''
    Concatenate the used part of every exon and translate the result.

    For each exon the slice sequence[relative_start:relative_stop] is taken
    and the slices are joined in the given order. The reading frame of the
    joined sequence is derived from the first exon only, and translation
    starts new_frame positions into the joined sequence.

    @param ensembl_exons: non-empty list of exon objects exposing
                          relative_start, relative_stop, frame and sequence.
                          NOTE(review): sequence is presumably a Biopython
                          Seq (translate() is called without arguments) -
                          confirm against the caller.
    @return: whatever translate() of the joined sequence returns
    '''
    # determine the frame
    first_exon = ensembl_exons[0]
    if first_exon.relative_start == 0:
        new_frame = first_exon.frame
    else:
        # the exon frame refers to position 0 of the exon sequence; starting
        # relative_start positions later shifts it back by that amount
        # (same mapping as the former 1 <-> 2 swap of the complement)
        new_frame = (first_exon.frame - first_exon.relative_start) % 3

    # create the cdna
    # (+= rather than str.join: the slices need not be plain strings)
    cdna = ""
    for ee in ensembl_exons:
        cdna += ee.sequence[ee.relative_start:ee.relative_stop]

    return cdna[new_frame:].translate()

def process_insertion_free_region(human_seq, spec_seq, frame, ordinal, alignment_start):
    '''
    Process a region of an alignment without any insertions in the
    reference species exon. There can exist deletions however (runs of '-'
    in spec_seq). Deletions that are 1 or 2 bases long and flanked by bases
    are first handed to replace_slash (per the original note they are
    padded with Ns and regarded as an assembly error), so they do not
    split the region.

    The region is then cut into consecutive pieces: every run of non-gap
    characters becomes a "coding" piece carrying a frame and alignment
    locations, every run of gaps becomes a "deletion" piece. Each piece
    uses up one ordinal, starting from the given one.

    @param human_seq: reference sequence of the region, sliced in parallel
                      with spec_seq
    @param spec_seq: aligned species sequence as a string, '-' marks a gap
    @param frame: frame at the first position of the region
    @param ordinal: ordinal of the first created piece
    @param alignment_start: alignment coordinate of the first position
    @return: list of AlignmentExonPiece in alignment order
    '''
    # NOTE(review): the pattern consumes both flanking bases, so in "A-T-G"
    # only the first short gap is seen by replace_slash - confirm intent.
    spec_seq = re.sub("([ATGCN])(-{1,2})([ATGCN])", replace_slash, spec_seq)
    spec_str = str(spec_seq)

    # no gap left: the whole region is one coding piece
    if "-" not in spec_str:
        al_piece = AlignmentExonPiece("coding", ordinal, human_seq, spec_seq)
        al_piece.set_frame(frame)
        al_piece.set_alignment_locations(alignment_start,
                                         alignment_start + len(spec_seq))
        return [al_piece]

    # Tokenise into maximal gap / non-gap runs. Unlike one big anchored
    # pattern with two capture groups per gap, this cannot fail to match:
    # regions that start or end with a gap are handled as well, and the
    # number of runs is not bounded by the regex group limit.
    al_exons = []
    offset = 0  # position of the current run inside the region
    for run in re.findall("-+|[^-]+", spec_str):
        run_end = offset + len(run)
        human = human_seq[offset:run_end]

        if run.startswith("-"):
            # deletion in the species sequence: nothing to translate, so no
            # frame and no alignment locations are recorded
            exon = AlignmentExonPiece("deletion", ordinal, human, run)
        else:
            exon = AlignmentExonPiece("coding", ordinal, human, run)
            # a piece starting `offset` positions into the region has its
            # frame shifted back by that amount; the very first position
            # keeps the caller's frame untouched
            exon.set_frame(frame if offset == 0 else (frame - offset) % 3)
            exon.set_alignment_locations(alignment_start + offset,
                                         alignment_start + run_end)

        al_exons.append(exon)
        offset = run_end
        ordinal += 1

    return al_exons