def find_protein_diff(read, ref, verbose=False, start_offset=3, end_trail=3):
    """Scan an aligned read against the reference codon by codon and list protein-level changes.

    :param read: aligned read sequence (may contain '-' gap characters), or None
    :param ref: aligned reference sequence (may contain '-' gap characters)
    :param verbose: if True, print the collected error list before returning
    :param start_offset: subtracted from the 'aligned' index to derive the amino acid
        number of the first scanned codon; also forwarded to findEnds and gapAlign
    :param end_trail: scanning stops once the codon start index exceeds
        ``len(ref) - end_trail``
    :return: ``(errors, short)``.
        ``errors`` is a tuple of records:
        ``(index, 'f')`` frameshift or unresolvable gap,
        ``(index, 'd')`` deletion of one codon,
        ``(index_before, 'i', inserted_protein)`` in-frame insertion,
        ``(index, 's', new_residue)`` substitution.
        ``short`` is the '/'-joined compact description, or 'wt' when nothing was recorded.
        ``(None, None)`` is returned when read is None or endMatch rejects the read.

    Deletion, insertion and substitution records are only written while the amino acid
    index is > 0; 'f' records are written regardless. Scanning stops at the first 'f'
    record, when a substituted codon translates to '*', or when format_insertion
    returns a truthy stop flag.
    """
    # quality control
    if read is None:
        return None, None
    ends = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, ends):
        return None, None

    # Working copies: both may be replaced by realigned versions inside the loop.
    newread = read
    newref = ref

    # scan reference triplet by triplet
    # move letters when encountering an indel
    prot_errors = []
    prot_short = []

    i = ends.get('aligned')
    ref_index = int((ends.get('aligned') - start_offset) / 3) + 1  # reference amino acid index
    max_i = len(ref) - end_trail

    while i <= ends.get('end'):
        if i > max_i:
            break

        # presumably gapAlign returns None when it cannot realign -- confirm against gapAlign
        if newread is None:
            break
        ref_codon = newref[i:i + 3]
        read_codon = newread[i:i + 3]

        if '-' in read_codon:  # found a deletion
            # Check if this is the last acid, and it's incomplete, ignore it.
            if re.search('[ATGC]', str(newread[i + 3:])) is None:
                break

            if '-' in ref_codon:  # something very broken
                prot_errors.append((ref_index, 'f'))
                prot_short.append('f')
                break
            elif read_codon == '---':  # single codon deletion
                if ref_index > 0:
                    prot_errors += [(ref_index, 'd')]
                    prot_short.append(str(ref_index) + 'Δ')
                i += 3
                ref_index += 1
            else:  # check it's not a frame shift
                l = indel_len(newread, i)
                if l % 3 != 0:
                    prot_errors.append((ref_index, 'f'))
                    prot_short.append('f')
                    break
                # realign gap and repeat loop at same position to compare the codons
                # findGap sees the slice starting at i - 1, so its result is shifted
                # by i - 1 to obtain indices into the full sequence.
                gap = findGap(newread[i - 1:])
                gap = (gap[0] + i - 1, gap[1] + i - 1)
                newread = gapAlign(newread, gap, start_offset)
                continue

        elif '-' in ref_codon:  # found an insertion
            l = indel_len(newref, i)
            if l % 3 != 0:
                prot_errors.append((ref_index, 'f'))
                prot_short.append('f')
                break
            gap = findGap(newref[i - 1:])
            # gap[0] == 1 presumably means the gap begins exactly at i (index 1 of the
            # slice that starts at i - 1) -- confirm against findGap
            if gap[0] == 1:  # insertion after codon
                insertion = newread[gap[0] + i - 1:gap[1] + i - 1]
                if '-' in insertion:
                    # the read has its own gap inside the inserted stretch
                    prot_errors.append((ref_index, 'f'))
                    prot_short.append('f')
                    break
                if ref_index > 0:
                    prot_errors.append(
                        (ref_index - 1, 'i', str(translate(insertion))
                         ))  # position before + insertion
                    stop, inslist = format_insertion(ref_index - 1, insertion)
                    prot_short += inslist
                    if stop:
                        break
                i += l
                # NOTE(review): the index is advanced although only gap columns of the
                # reference were skipped; this looks like it shifts the numbering of any
                # later change by one -- confirm whether this is intended.
                ref_index += 1
            else:  # realign gap and repeat loop at same position to compare the codons
                gap = (gap[0] + i - 1, gap[1] + i - 1)
                newref = gapAlign(newref, gap, start_offset)
                continue

        elif translate(read_codon) != translate(
                ref_codon):  # must be a substitution
            if ref_index > 0:
                prot_errors.append(
                    (ref_index, 's', str(translate(read_codon))))
                prot_short.append(
                    str(
                        translate(ref_codon) + str(ref_index) +
                        str(translate(read_codon))))
            # a premature stop codon ends the scan
            if str(translate(read_codon)) == '*':
                break
            i += 3
            ref_index += 1

        else:
            # codons translate identically: nothing to record
            i += 3
            ref_index += 1

    if verbose:
        print(prot_errors)

    if prot_short == []:
        short = 'wt'
    else:
        short = '/'.join(prot_short)

    return tuple(prot_errors), short
def find_dna_hgvs(read, ref, refname, verbose=False, start_offset=3, end_trail=3):
    """Build an HGVS-style coding-DNA description of how an aligned read differs from the reference.

    The read and reference are compared position by position; no gap realignment is
    done here. Numbering is set up so that alignment index ``start_offset`` corresponds
    to position 1, and only changes at positions > 0 are written out.

    :param read: aligned read (documented upstream as a MutableSeq); '-' marks a gap.
        None is accepted and yields None.
    :param ref: aligned reference; '-' marks a gap
    :param refname: label placed in front of ':c.'
    :param verbose: if True, print the reason whenever no description is produced
    :param start_offset: number of nt in front of the first numbered nucleotide
    :param end_trail: the scan stops once the index exceeds ``len(ref) - end_trail``
    :return: ``'<refname>:c.<change>'`` when exactly one change was found, otherwise
        ``'<refname>:c.[<change>;<change>...]'``; None if read is None or endMatch
        rejects the read
    """
    if read is None:
        if verbose:
            print('no read provided')
        return None

    # No gap realignment at this point
    label = '{!s}:c.'.format(refname)

    # quality control that there are no mutations at ends of reads
    bounds = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, bounds):
        if verbose:
            print('ends do not match')
        return None

    # pos walks the alignment (0-based); hgvs_pos is the 1-based reference position
    # used in the output and only advances when a reference nucleotide is consumed.
    pos = bounds.get('start')
    hgvs_pos = pos - start_offset + 1
    scan_end = bounds.get('end')
    last_pos = len(ref) - end_trail  # the trailing nt are ignored when reading mutations
    changes = []

    while pos < scan_end and pos <= last_pos:
        read_nt = read[pos]
        ref_nt = ref[pos]

        if read_nt == ref_nt:
            pos += 1
            hgvs_pos += 1
            continue

        if read_nt == '-':
            # deletion: POSdel for a single nt, FIRST_LASTdel for a longer stretch
            span = indel_len(read, pos)
            if hgvs_pos > 0:
                if span == 1:
                    changes.append('{!s}del'.format(hgvs_pos))
                else:
                    changes.append(
                        '{!s}_{!s}del'.format(hgvs_pos, hgvs_pos + span - 1))
            pos += span
            hgvs_pos += span
            continue

        if ref_nt == '-':
            # insertion: FLANK_FLANKinsSEQ; the reference position does not advance
            # NOTE(review): at hgvs_pos == 1 this emits '0_1ins...'; HGVS numbering has
            # no position 0 -- confirm whether downstream code expects this form.
            span = indel_len(ref, pos)
            if hgvs_pos > 0:
                changes.append('{!s}_{!s}ins{!s}'.format(
                    hgvs_pos - 1, hgvs_pos, read[pos:pos + span]))
            pos += span
            continue

        # substitution: reference nt is part of the notation, e.g. 8A>G
        if hgvs_pos > 0:
            changes.append('{!s}{!s}>{!s}'.format(hgvs_pos, ref_nt, read_nt))
        pos += 1
        hgvs_pos += 1

    # format the result including name of sequence
    # NOTE(review): a read without changes yields '<refname>:c.[]' -- confirm that
    # callers treat this as wild type.
    if len(changes) == 1:
        return label + changes[0]
    return label + '[' + ';'.join(changes) + ']'
def find_dna_diff(read, ref, verbose=False, start_offset=3, end_trail=3):
    """Report nucleotide-level differences between an aligned read and the reference.

    The read and reference are compared letter by letter without gap realignment.
    By default the reference is assumed to carry 3 extra nt on either side of the gene
    of interest; ``start_offset`` and ``end_trail`` set those counts. All positions in
    the result are 1-based reference positions (the same numbering find_dna_hgvs uses)
    and are given as strings:

    - substitution ``(pos, 's', nt)``: reference nt ``pos`` is changed to ``nt``
    - deletion ``(pos, 'd', length)``: ``length`` nt are deleted starting with nt ``pos``
    - insertion ``(pos, 'i', seq)``: ``seq`` is inserted after reference nt ``pos``
    - frameshift ``(pos, 'f')``: an indel whose length is not a multiple of 3 starts at
      ``pos``; scanning stops there

    Substitutions, deletions and insertions are only recorded while the running
    reference position is > 0; frameshifts are always recorded.

    :param read: aligned read (documented upstream as a MutableSeq); '-' marks a gap.
        None is accepted and yields None.
    :param ref: aligned reference; '-' marks a gap
    :param verbose: if True, print the reason whenever no result is produced
    :param start_offset: number of nt in front of the first numbered nucleotide
    :param end_trail: the scan stops once the index exceeds ``len(ref) - end_trail``
    :return: tuple of the records described above, or None if read is None or endMatch
        rejects the read
    """
    if read is None:
        if verbose:
            print('no read provided')
        return None

    # No gap realignment at this point

    # quality control that there are no mutations at ends of reads
    ends = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, ends):
        if verbose:
            print('ends do not match')
        return None

    # i walks the alignment (0-based). ref_index is the 1-based number of the reference
    # nucleotide at i; it does not advance over gap columns in the reference.
    dna_errors = []
    i = ends.get('start')
    ref_index = i - start_offset + 1
    end = ends.get('end')
    max_i = len(ref) - end_trail

    while i < end and i <= max_i:
        if read[i] == ref[i]:
            ref_index += 1
            i += 1
        elif read[i] == '-':
            # start of a deletion; ref_index is the first deleted nucleotide
            length = indel_len(read, i)
            if length % 3 != 0:
                dna_errors.append((str(ref_index), 'f'))
                break
            if ref_index > 0:
                dna_errors.append((str(ref_index), 'd', str(length)))
            i += length
            ref_index += length
        elif ref[i] == '-':
            # start of an insertion
            length = indel_len(ref, i)
            if length % 3 != 0:
                dna_errors.append((str(ref_index), 'f'))
                break
            if ref_index > 0:
                # ref_index names the next reference nt, so the insertion sits after
                # ref_index - 1 (previously reported one position too high).
                dna_errors.append(
                    (str(ref_index - 1), 'i', str(read[i:i + length])))
            i += length
        else:
            # substitution of the nucleotide at ref_index (previously reported as
            # ref_index + 1, one higher than deletions at the same nucleotide)
            if ref_index > 0:
                dna_errors.append((str(ref_index), 's', str(read[i])))
            i += 1
            ref_index += 1

    return tuple(dna_errors)