def _parse_ranges(field):
    """Parse ``"start,stop;start,stop;..."`` into a list of ``(int, int)`` tuples."""
    ranges = []
    for chunk in field.strip().split(";"):
        bounds = chunk.strip().split(",")
        ranges.append((int(bounds[0]), int(bounds[1])))
    return ranges


def _read_annotations(annotation_file):
    """Read the tab-separated annotation file.

    Returns a list of ``(ann_type, ranges, gene, protein, protein_id,
    ref_aa_seq)`` records.  Columns are taken by index: 2 = annotation type,
    3 = coordinate ranges, 4 = gene, 5 = protein, 6 = protein id,
    7 = reference amino-acid sequence.  A ``"."`` in columns 4-7 becomes
    ``None``.  Blank lines are skipped instead of raising ``IndexError``.
    """
    def optional(value):
        return None if value == "." else value

    records = []
    with open(annotation_file) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            cols = stripped.split("\t")
            records.append((
                cols[2],
                _parse_ranges(cols[3]),
                optional(cols[4]),
                optional(cols[5]),
                optional(cols[6]),
                optional(cols[7]),
            ))
    return records


def _slice_alignment(ref_positions, seq_aligned, start, stop):
    """Return the bases of ``seq_aligned`` whose paired ``ref_positions`` entry
    lies in ``[start, stop]`` (inclusive), with gap characters removed."""
    return "".join(
        [base for pos, base in zip(ref_positions, seq_aligned) if start <= pos <= stop]
    ).replace("-", "")


def _translate_standard(dna):
    """Translate ``dna`` with codon table id 1; stop codons come back as ``*``.

    Relies on the underscore-prefixed ``Seq._translate_str`` helper, exactly as
    the surrounding code always has.
    """
    table = CodonTable.ambiguous_dna_by_id[1]
    return Seq._translate_str(dna, table, cds=False)


def _true_runs(flags):
    """Return ``(start, end)`` half-open column spans of each maximal run of
    truthy values in ``flags``."""
    spans = []
    run_start = None
    for col, hit in enumerate(flags):
        if hit:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            spans.append((run_start, col))
            run_start = None
    if run_start is not None:
        spans.append((run_start, len(flags)))
    return spans


def _call_aa_mutations(ref_aln, seq_aln, gene, protein, protein_id):
    """List the amino-acid differences between two aligned protein strings.

    Each mutation is ``(gene, protein, protein_id, pos, original, alternative,
    kind)``.  All ``"INS"`` entries come first, then all ``"DEL"``, then all
    ``"SUB"``; adjacent columns of the same kind are merged into one entry.
    ``pos`` is a numpy integer counting reference residues: for DEL/SUB it is
    the 1-based position of the first affected reference residue, for INS it is
    the number of reference residues preceding the insertion (0 at the start).
    """
    # Running count of non-gap reference residues up to and including each column.
    ref_pos = np.cumsum([res != '-' for res in ref_aln], dtype=int)

    ins_flags = [res == '-' for res in ref_aln]
    del_flags = [res == '-' for res in seq_aln]
    sub_flags = [r != '-' and s != '-' and r != s for r, s in zip(ref_aln, seq_aln)]

    mutations = []
    for a, b in _true_runs(ins_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], "-" * (b - a), seq_aln[a:b], "INS"))
    for a, b in _true_runs(del_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], ref_aln[a:b], "-" * (b - a), "DEL"))
    for a, b in _true_runs(sub_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], ref_aln[a:b], seq_aln[a:b], "SUB"))
    return mutations


def call_annotation_variant(annotation_file, ref_aligned, seq_aligned, ref_positions, seq_positions, sequence_id = 666):
    """Annotate an aligned sequence and call amino-acid variants per protein.

    NOTE(review): a second function with this same name is defined later in
    this file and would replace this one at import time - confirm which
    definition is intended to be live.

    Args:
        annotation_file: path to a tab-separated annotation file (see
            ``_read_annotations`` for the column layout).
        ref_aligned: unused by this function.
        seq_aligned: the aligned target sequence, ``-`` marking gaps.
        ref_positions: iterated in lockstep with ``seq_aligned``; each entry is
            compared against the annotation start/stop coordinates.
        seq_positions: unused by this function.
        sequence_id: only used in the warning logged for frameshifted proteins.

    Returns:
        A list of tuples ``(gene, protein, protein_id, ann_type, nuc_start,
        nuc_stop, nuc_seq, aa_seq, mutations)``:

        * ``gene`` annotations: ``aa_seq`` is ``None`` and ``mutations`` is ``[]``.
        * ``CDS`` / ``mature_protein_region`` with no covered bases:
          ``nuc_seq`` and ``aa_seq`` are ``None``.
        * ``CDS`` / ``mature_protein_region`` whose length is not a multiple
          of 3: ``aa_seq`` is ``None`` and the protein is reported in a warning.
        * otherwise ``aa_seq`` is the translation with ``*`` removed and
          ``mutations`` is the output of ``_call_aa_mutations``.

        Other annotation types are ignored.  A protein for which the pairwise
        aligner returns no alignment is omitted from the result.
    """
    list_annotations = []
    proteins_not_multiple_of_3 = []

    for atype, ranges, gene, protein, protein_id, ref_aa_seq in _read_annotations(annotation_file):
        nuc_start = ranges[0][0]
        nuc_stop = ranges[-1][1]
        # Nucleotides over the full span of the annotation (first start .. last stop).
        nuc_seq = _slice_alignment(ref_positions, seq_aligned, nuc_start, nuc_stop)
        header = (gene, protein, protein_id, atype, nuc_start, nuc_stop)

        if atype == 'gene':
            list_annotations.append(header + (nuc_seq, None, []))
            continue
        if atype not in ('mature_protein_region', 'CDS'):
            continue

        # Concatenate the ungapped bases of every range of this protein; this
        # is the string that gets translated.
        dna_ref = "".join(
            _slice_alignment(ref_positions, seq_aligned, start, stop)
            for start, stop in ranges
        )

        if len(dna_ref) == 0:
            list_annotations.append(header + (None, None, []))
            continue
        if len(dna_ref) % 3 != 0:
            # nucleotide sequence not multiple of 3
            proteins_not_multiple_of_3.append(protein)
            list_annotations.append(header + (nuc_seq, None, []))
            continue

        # Drop the stop-codon symbol ("*") before comparing with the reference.
        aa_seq = _translate_standard(dna_ref).replace("*", "")

        # Global alignment of the reference protein against the translated one
        # (positional scores passed to globalms: 3, -1, -3, -1).
        alignment_aa = pairwise2.align.globalms(ref_aa_seq, aa_seq, 3, -1, -3, -1)
        if not alignment_aa:
            # No alignment available: this annotation is left out of the
            # result, as before.
            continue
        ref_aligned_aa = alignment_aa[0][0]
        seq_aligned_aa = alignment_aa[0][1]

        list_mutations = _call_aa_mutations(
            ref_aligned_aa, seq_aligned_aa, gene, protein, protein_id)
        list_annotations.append(header + (nuc_seq, aa_seq, list_mutations))

    # log frameshift not detectable in the final output
    if proteins_not_multiple_of_3:
        logger.warning(f"sequence ID {sequence_id} has proteins of length not multiple of 3 {proteins_not_multiple_of_3}")
    return list_annotations
def _parse_ranges(field):
    """Parse ``"start,stop;start,stop;..."`` into a list of ``(int, int)`` tuples."""
    ranges = []
    for chunk in field.strip().split(";"):
        bounds = chunk.strip().split(",")
        ranges.append((int(bounds[0]), int(bounds[1])))
    return ranges


def _read_annotations(annotation_file):
    """Read the tab-separated annotation file.

    Returns a list of ``(ann_type, ranges, gene, protein, protein_id,
    ref_aa_seq)`` records.  Columns are taken by index: 2 = annotation type,
    3 = coordinate ranges, 4 = gene, 5 = protein, 6 = protein id,
    7 = reference amino-acid sequence.  A ``"."`` in columns 4-7 becomes
    ``None``.  Blank lines are skipped instead of raising ``IndexError``.
    """
    def optional(value):
        return None if value == "." else value

    records = []
    with open(annotation_file) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            cols = stripped.split("\t")
            records.append((
                cols[2],
                _parse_ranges(cols[3]),
                optional(cols[4]),
                optional(cols[5]),
                optional(cols[6]),
                optional(cols[7]),
            ))
    return records


def _slice_alignment(ref_positions, seq_aligned, start, stop):
    """Return the bases of ``seq_aligned`` whose paired ``ref_positions`` entry
    lies in ``[start, stop]`` (inclusive), with gap characters removed."""
    return "".join(
        [base for pos, base in zip(ref_positions, seq_aligned) if start <= pos <= stop]
    ).replace("-", "")


def _translate_standard(dna):
    """Translate ``dna`` with codon table id 1; stop codons come back as ``*``.

    Relies on the underscore-prefixed ``Seq._translate_str`` helper, exactly as
    the surrounding code always has.
    """
    table = CodonTable.ambiguous_dna_by_id[1]
    return Seq._translate_str(dna, table, cds=False)


def _true_runs(flags):
    """Return ``(start, end)`` half-open column spans of each maximal run of
    truthy values in ``flags``."""
    spans = []
    run_start = None
    for col, hit in enumerate(flags):
        if hit:
            if run_start is None:
                run_start = col
        elif run_start is not None:
            spans.append((run_start, col))
            run_start = None
    if run_start is not None:
        spans.append((run_start, len(flags)))
    return spans


def _call_aa_mutations(ref_aln, seq_aln, gene, protein, protein_id):
    """List the amino-acid differences between two aligned protein strings.

    Each mutation is ``(gene, protein, protein_id, pos, original, alternative,
    kind)``.  All ``"INS"`` entries come first, then all ``"DEL"``, then all
    ``"SUB"``; adjacent columns of the same kind are merged into one entry.
    ``pos`` is a numpy integer counting reference residues: for DEL/SUB it is
    the 1-based position of the first affected reference residue, for INS it is
    the number of reference residues preceding the insertion (0 at the start).
    """
    # Running count of non-gap reference residues up to and including each column.
    ref_pos = np.cumsum([res != '-' for res in ref_aln], dtype=int)

    ins_flags = [res == '-' for res in ref_aln]
    del_flags = [res == '-' for res in seq_aln]
    sub_flags = [r != '-' and s != '-' and r != s for r, s in zip(ref_aln, seq_aln)]

    mutations = []
    for a, b in _true_runs(ins_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], "-" * (b - a), seq_aln[a:b], "INS"))
    for a, b in _true_runs(del_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], ref_aln[a:b], "-" * (b - a), "DEL"))
    for a, b in _true_runs(sub_flags):
        mutations.append(
            (gene, protein, protein_id, ref_pos[a], ref_aln[a:b], seq_aln[a:b], "SUB"))
    return mutations


def call_annotation_variant(annotation_file, ref_aligned, seq_aligned, ref_positions, seq_positions, sequence_id = 666):
    """Annotate an aligned sequence and call amino-acid variants per protein.

    NOTE(review): this file also contains an earlier function with this same
    name; being defined later, this definition is the one that stays bound at
    import time - confirm that is intended.

    Args:
        annotation_file: path to a tab-separated annotation file (see
            ``_read_annotations`` for the column layout).
        ref_aligned: unused by this function.
        seq_aligned: the aligned target sequence, ``-`` marking gaps.
        ref_positions: iterated in lockstep with ``seq_aligned``; each entry is
            compared against the annotation start/stop coordinates.
        seq_positions: unused by this function.
        sequence_id: unused by this function.

    Returns:
        A list of tuples ``(gene, protein, protein_id, ann_type, nuc_start,
        nuc_stop, nuc_seq, aa_seq, mutations)``, one per ``CDS`` /
        ``mature_protein_region`` annotation:

        * no covered bases: ``nuc_seq`` and ``aa_seq`` are ``None``.
        * length not a multiple of 3: ``aa_seq`` is ``None``, ``mutations``
          is ``[]``.
        * otherwise ``aa_seq`` is the translation with ``*`` removed and
          ``mutations`` is the output of ``_call_aa_mutations``.

        Every other annotation type is ignored.  A protein for which the
        pairwise aligner returns no alignment is omitted from the result.
    """
    list_annotations = []

    for atype, ranges, gene, protein, protein_id, ref_aa_seq in _read_annotations(annotation_file):
        nuc_start = ranges[0][0]
        nuc_stop = ranges[-1][1]
        # Nucleotides over the full span of the annotation (first start .. last stop).
        nuc_seq = _slice_alignment(ref_positions, seq_aligned, nuc_start, nuc_stop)

        if atype not in ('mature_protein_region', 'CDS'):
            continue
        header = (gene, protein, protein_id, atype, nuc_start, nuc_stop)

        # Concatenate the ungapped bases of every range of this protein; this
        # is the string that gets translated.
        dna_ref = "".join(
            _slice_alignment(ref_positions, seq_aligned, start, stop)
            for start, stop in ranges
        )

        if len(dna_ref) == 0:
            list_annotations.append(header + (None, None, []))
            continue
        if len(dna_ref) % 3 != 0:
            # Not translatable in frame: keep the nucleotides, no protein.
            list_annotations.append(header + (nuc_seq, None, []))
            continue

        # Drop the stop-codon symbol ("*") before comparing with the reference.
        aa_seq = _translate_standard(dna_ref).replace("*", "")

        # Global alignment of the reference protein against the translated one
        # (positional scores passed to globalms: 3, -1, -3, -1).
        alignment_aa = pairwise2.align.globalms(ref_aa_seq, aa_seq, 3, -1, -3, -1)
        if not alignment_aa:
            # No alignment available: this annotation is left out of the
            # result, as before.
            continue
        ref_aligned_aa = alignment_aa[0][0]
        seq_aligned_aa = alignment_aa[0][1]

        list_mutations = _call_aa_mutations(
            ref_aligned_aa, seq_aligned_aa, gene, protein, protein_id)
        list_annotations.append(header + (nuc_seq, aa_seq, list_mutations))

    return list_annotations