Beispiel #1
0
def find_protein_diff(read, ref, verbose=False, start_offset=3, end_trail=3):
    """Report amino-acid level differences between an aligned read and reference.

    Both sequences are walked three letters at a time, starting at
    ``ends['aligned']`` as returned by ``findEnds``.  A gap ('-') in the read
    codon is handled as a deletion, a gap in the reference codon as an
    insertion, and any other codon pair whose translations differ as a
    substitution.  Indels that are not a whole gapped codon are shifted with
    ``gapAlign`` and the same position is examined again.

    :param read: aligned read sequence, or None
    :param ref: aligned reference sequence, indexed with the same coordinates
    :param verbose: when true, print the collected error tuples before returning
    :param start_offset: number of reference nt preceding the first counted
        codon; used to derive the amino-acid numbering
    :param end_trail: number of trailing reference nt; the scan stops once the
        index is greater than ``len(ref) - end_trail``
    :return: ``(None, None)`` if ``read`` is None or ``endMatch`` rejects the
        read.  Otherwise ``(errors, short)`` where ``errors`` is a tuple of
        ``(pos, 'f')`` frameshift / broken alignment,
        ``(pos, 'd')`` single codon deletion,
        ``(pos_before, 'i', inserted_aa)`` insertion and
        ``(pos, 's', new_aa)`` substitution entries, and ``short`` is ``'wt'``
        when nothing was recorded or the '/'-joined short labels otherwise.
    """
    # quality control
    if read is None:
        return None, None
    ends = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, ends):
        return None, None

    # Working copies: gap realignment rebinds these names.
    newread = read
    newref = ref

    prot_errors = []
    prot_short = []

    i = ends.get('aligned')
    # reference amino acid index of the codon at position i
    ref_index = int((ends.get('aligned') - start_offset) / 3) + 1
    max_i = len(ref) - end_trail

    while i <= ends.get('end'):
        if i > max_i:
            break

        # gapAlign result is rebound below; stop if no sequence is left
        if newread is None:
            break
        ref_codon = newref[i:i + 3]
        read_codon = newread[i:i + 3]

        if '-' in read_codon:  # found a deletion
            # If no nucleotide follows this codon it is an incomplete last
            # amino acid: ignore it.
            if re.search('[ATGC]', str(newread[i + 3:])) is None:
                break

            if '-' in ref_codon:  # gaps on both sides: something very broken
                prot_errors.append((ref_index, 'f'))
                prot_short.append('f')
                break
            elif read_codon == '---':  # single codon deletion
                if ref_index > 0:
                    prot_errors.append((ref_index, 'd'))
                    prot_short.append(str(ref_index) + 'Δ')
                i += 3
                ref_index += 1

            else:  # partial codon gap: check it's not a frame shift
                indel_length = indel_len(newread, i)
                if indel_length % 3 != 0:
                    prot_errors.append((ref_index, 'f'))
                    prot_short.append('f')
                    break
                # realign gap and repeat loop at same position to compare the codons
                gap = findGap(newread[i - 1:])
                # findGap was given a slice starting at i - 1; shift its
                # offsets back into full-sequence coordinates
                gap = (gap[0] + i - 1, gap[1] + i - 1)
                newread = gapAlign(newread, gap, start_offset)
                continue

        elif '-' in ref_codon:  # found an insertion
            indel_length = indel_len(newref, i)
            if indel_length % 3 != 0:
                prot_errors.append((ref_index, 'f'))
                prot_short.append('f')
                break
            gap = findGap(newref[i - 1:])
            if gap[0] == 1:  # gap starts exactly at i: insertion after codon
                insertion = newread[gap[0] + i - 1:gap[1] + i - 1]
                if '-' in insertion:
                    prot_errors.append((ref_index, 'f'))
                    prot_short.append('f')
                    break
                if ref_index > 0:
                    # reported at the position before the insertion
                    prot_errors.append(
                        (ref_index - 1, 'i', str(translate(insertion))))
                    stop, inslist = format_insertion(ref_index - 1, insertion)
                    prot_short += inslist
                    if stop:
                        break
                # Skip the inserted letters only.  An insertion consumes no
                # reference codon, so the codon that follows the gap is still
                # amino acid ref_index; incrementing here would shift the
                # numbering of every later mutation by one.
                i += indel_length
            else:  # realign gap and repeat loop at same position to compare the codons
                gap = (gap[0] + i - 1, gap[1] + i - 1)
                newref = gapAlign(newref, gap, start_offset)
                continue

        elif translate(read_codon) != translate(ref_codon):
            # no gaps but different amino acids: must be a substitution
            if ref_index > 0:
                prot_errors.append(
                    (ref_index, 's', str(translate(read_codon))))
                prot_short.append(
                    str(
                        translate(ref_codon) + str(ref_index) +
                        str(translate(read_codon))))
            # nothing after a stop codon is reported
            if str(translate(read_codon)) == '*':
                break
            i += 3
            ref_index += 1

        else:  # identical or synonymous codon
            i += 3
            ref_index += 1

    if verbose:
        print(prot_errors)

    short = '/'.join(prot_short) if prot_short else 'wt'

    return tuple(prot_errors), short
Beispiel #2
0
def find_dna_hgvs(read,
                  ref,
                  refname,
                  verbose=False,
                  start_offset=3,
                  end_trail=3):
    """Describe the nucleotide changes of an aligned read as an HGVS-style string.

    read and ref are aligned sequences (MutableSeq objects) compared letter by
    letter; no gap realignment is done here.

    The assumption is that the reference carries 3 nt either side of the gene
    of interest, so the first nt of the gene sits at index 3 and is numbered 1.
    Pass start_offset explicitly if the leading offset differs.  end_trail is
    the number of nt after the end of the gene, which are not scanned.

    :return: None if read is None or the read ends do not match the reference;
        otherwise ``'<refname>:c.<change>'`` for exactly one change and
        ``'<refname>:c.[<change>;<change>...]'`` in every other case
        (including no change at all).
    """
    if read is None:
        if verbose:
            print('no read provided')
        return None

    prefix = str(refname) + ':c.'

    # reject reads with mutations at their ends
    ends = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, ends):
        if verbose:
            print('ends do not match')
        return None

    # cursor indexes the aligned sequences; hgvs_pos is the 1-based HGVS
    # position in the reference (index start_offset maps to position 1)
    hgvs_pos = ends.get('start') - start_offset + 1
    cursor = ends.get('start')
    last_scanned = len(ref) - end_trail
    changes = []

    while cursor < ends.get('end') and cursor <= last_scanned:
        read_nt = read[cursor]
        ref_nt = ref[cursor]

        if read_nt == ref_nt:
            hgvs_pos += 1
            cursor += 1
            continue

        if read_nt == '-':
            # deletion: POSdel for one nt, FIRST_LASTdel for a longer run
            span = indel_len(read, cursor)
            if hgvs_pos > 0:
                if span == 1:
                    changes.append('{}del'.format(hgvs_pos))
                else:
                    changes.append(
                        '{}_{}del'.format(hgvs_pos, hgvs_pos + span - 1))
            cursor += span
            hgvs_pos += span
            continue

        if ref_nt == '-':
            # insertion: FLANK_FLANKinsSEQ; the reference position stays put
            span = indel_len(ref, cursor)
            if hgvs_pos > 0:
                inserted = str(read[cursor:cursor + span])
                changes.append(
                    '{}_{}ins{}'.format(hgvs_pos - 1, hgvs_pos, inserted))
            cursor += span
            continue

        # substitution: position, reference nt, '>', read nt (e.g. 8A>G)
        if hgvs_pos > 0:
            changes.append(
                '{}{}>{}'.format(hgvs_pos, str(ref_nt), str(read_nt)))
        cursor += 1
        hgvs_pos += 1

    # a lone change is written bare, anything else as a bracketed list
    if len(changes) == 1:
        return prefix + changes[0]
    return prefix + '[' + ';'.join(changes) + ']'
Beispiel #3
0
def find_dna_diff(read, ref, verbose=False, start_offset=3, end_trail=3):
    """Report nucleotide-level differences between an aligned read and reference.

    @ read, ref: MutableSeq objects, aligned to each other ('-' marks a gap)

    The default assumption is that the reference includes 3 nt either side of
    the gene of interest, so the first nt of the gene (index ``start_offset``)
    is numbered 1.  The starting offset and the number of trailing nt are
    variable.  No gap realignment is done here.

    All positions in the result are 1-based reference positions, stored as
    strings:
    - substitution ``('78', 's', 'C')``: nt 78 in reference is changed to C
    - deletion ``('78', 'd', '6')``: 6 nt deleted starting with nt 78
    - insertion ``('78', 'i', 'ATC')``: ATC inserted after nt 78
    - frameshift ``('78', 'f')``: an indel whose length is not a multiple of
      3 starts here; scanning stops at the first frameshift

    :param verbose: when true, print why no result is produced
    :param start_offset: number of reference nt before position 1
    :param end_trail: number of trailing reference nt; the scan stops once the
        index is greater than ``len(ref) - end_trail``
    :return: tuple of the entries above (empty if the read matches), or None
        if ``read`` is None or the read ends do not match the reference
    """

    if read is None:
        if verbose:
            print('no read provided')
        return None

    # quality control that there are no mutations at ends of reads
    ends = findEnds(read, ref, start_offset)
    if not endMatch(read, ref, ends):
        if verbose:
            print('ends do not match')
        return None

    # scan read & reference letter by letter, counting position in reference
    # i indexes the aligned sequences; ref_index is the 1-based reference
    # position of ref[i] (index start_offset corresponds to position 1)
    dna_errors = []
    ref_index = ends.get('start') - start_offset + 1
    i = ends.get('start')
    max_i = len(ref) - end_trail

    while i < ends.get('end'):
        if i > max_i:
            break
        # check for differences
        if read[i] == ref[i]:
            ref_index += 1
            i += 1

        elif read[i] == '-':
            # start of a deletion
            indel_length = indel_len(read, i)
            # now we know the length of a deletion, check for frameshifts
            if indel_length % 3 == 0:
                if ref_index > 0:
                    # deletion of indel_length nt starting at ref_index
                    dna_errors.append(
                        (str(ref_index), 'd', str(indel_length)))
                i += indel_length
                ref_index += indel_length
            else:
                dna_errors.append((str(ref_index), 'f'))
                break

        elif ref[i] == '-':
            # start of an insertion
            indel_length = indel_len(ref, i)
            # check for frameshifts
            if indel_length % 3 == 0:
                if ref_index > 0:
                    # ref_index is the position of the next reference nt, so
                    # the insertion sits after nt ref_index - 1
                    dna_errors.append(
                        (str(ref_index - 1), 'i',
                         str(read[i:i + indel_length])))
                # inserted letters consume no reference position
                i += indel_length
            else:
                dna_errors.append((str(ref_index), 'f'))
                break

        else:
            # substitution at ref[i], whose 1-based position is ref_index
            # (same numbering as the deletion branch above)
            if ref_index > 0:
                dna_errors.append((str(ref_index), 's', str(read[i])))
            i += 1
            ref_index += 1

    return tuple(dna_errors)