Exemple #1
0
    def testTwoCodons(self):
        """ Two complete codons are translated into two amino acids. """
        codons = 'TTTCCT'

        translated = translate(codons)

        self.assertEqual('FP', translated)
Exemple #2
0
    def testSingleDashAmbiguous(self):
        """ A codon with one dash is reported as '?' in this case. """
        codon = '-TT'

        translated = translate(codon)

        self.assertEqual('?', translated)
Exemple #3
0
    def testAmbiguousBasesThatAreSynonyms(self):
        """ A mixture whose codons are all synonyms gives one amino acid. """
        codon = 'TTY'  # TTC or TTT: both map to F

        translated = translate(codon)

        self.assertEqual('F', translated)
Exemple #4
0
    def load_reading_frames(self, seed_name):
        """ Calculate reading frames along a consensus sequence.

        The consensus is translated in each of the three reading frames and
        aligned with every coordinate reference of the seed. For each
        coordinate reference, the best scoring frame is recorded against the
        consensus positions that align with the reference.

        :param seed_name: the name of the seed to look up
        :return: {pos: frame} zero-based position and reading frame for each
            position. Frame 1 needs one nucleotide inserted at start.
            The result is a Counter, so unrecorded positions read as frame 0.
        """
        result = Counter()
        conseq = self.remap_conseqs[seed_name]
        coord_refs = self.projects.getCoordinateReferences(seed_name)
        for coord_ref in coord_refs.values():
            # Start below any real score, so the first alignment always wins.
            best_alignment = (-1000000, '', '', 0)
            for frame_index in range(3):
                # Pad the start with dashes to shift the reading frame.
                conseq_aminos = translate('-' * frame_index + conseq)
                aconseq, acoord, score = self._pair_align(
                    conseq_aminos, coord_ref, GAP_OPEN_COORD, GAP_EXTEND_COORD)
                best_alignment = max(best_alignment,
                                     (score, aconseq, acoord, frame_index))
            _, aconseq, acoord, frame_index = best_alignment
            if frame_index == 0:
                continue  # defaults to 0, no need to record
            conseq_codon_index = -1
            for conseq_amino, coord_amino in zip(aconseq, acoord):
                if conseq_amino != '-':
                    conseq_codon_index += 1
                if coord_amino == '-':
                    # Not aligned with the coordinate reference: don't record.
                    continue

                # Undo the padding to get back to a consensus position.
                nuc_pos = conseq_codon_index * 3 - frame_index
                for i in range(3):
                    result[nuc_pos + i] = frame_index
        return result
Exemple #5
0
    def testTwoAmbiguousBasesThatAreNotSynonyms(self):
        """ Mixtures that code for different amino acids give '?'. """
        codon = 'RGR'  # GGA, GGG, AGA, or AGG: map to G and R, respectively

        translated = translate(codon)

        self.assertEqual('?', translated)
Exemple #6
0
    def testTwoAmbiguousBasesThatAreSynonyms(self):
        """ Two mixtures in one codon can still give one amino acid. """
        codon = 'MGR'  # CGA, CGG, AGA, or AGG: all map to R

        translated = translate(codon)

        self.assertEqual('R', translated)
Exemple #7
0
    def testPartialCodon(self):
        """ Trailing nucleotides that don't fill a codon are dropped. """
        one_and_a_bit_codons = 'TTTCC'

        translated = translate(one_and_a_bit_codons)

        self.assertEqual('F', translated)
Exemple #8
0
    def testTwoDashes(self):
        """ A codon with two dashes is reported as '?'. """
        codon = '--T'

        translated = translate(codon)

        self.assertEqual('?', translated)
Exemple #9
0
    def testAmbiguousAminosListed(self):
        """ With list_ambiguous, the possible aminos appear in brackets. """
        codon = 'TTM'  # TTA or TTC: map to L or F

        translated = translate(codon, list_ambiguous=True)

        self.assertEqual('[FL]', translated)
Exemple #10
0
    def testReturnList(self):
        """ With return_list, each codon gives a list of possible aminos. """
        codons = 'CGATTM'  # TTA or TTC: map to L or F
        expected_lists = [['R'], ['F', 'L']]

        amino_lists = translate(codons, return_list=True)

        self.assertEqual(expected_lists, amino_lists)
Exemple #11
0
    def testMixturesNotTranslated(self):
        """ Without translate_mixtures, even synonym mixtures give '?'. """
        codon = 'TTY'  # TTC or TTT: both map to F

        translated = translate(codon, translate_mixtures=False)

        self.assertEqual('?', translated)
Exemple #12
0
    def testListAmbiguousOverridesMixturesNotTranslated(self):
        """ list_ambiguous wins when translate_mixtures is switched off. """
        codon = 'TTY'

        translated = translate(codon,
                               translate_mixtures=False,
                               list_ambiguous=True)

        self.assertEqual('F', translated)
Exemple #13
0
def extract_target(seed_ref, coordinate_ref):
    """ Cut out the part of a seed that lines up with a coordinate reference.

    The seed is translated in all three reading frames, and the frame with
    the best alignment score against the coordinate reference is used.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    """
    # The placeholder scores below anything real, so a real alignment wins.
    alignments = [(-1000000, '', '', 0)]
    for frame in range(3):
        shifted_aminos = translate('-' * frame + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            shifted_aminos,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        alignments.append((frame_score, aligned_seed, aligned_coord, frame))
    score, aseed, acoord, frame_index = max(alignments)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    codon_end = -frame_index  # compensates for the dashes added at the start
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino != '-':
            codon_end += 3
            if coord_amino != '-':
                codons.append(seed_ref[codon_end - 3:codon_end])
    return ''.join(codons)
Exemple #14
0
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Convert a range of coordinate positions to nucleotide offsets in a seed.

    Every seed of the HCV project whose name starts with 'HCV-2' is
    translated with nucleotide offsets 0, 1, and 2, then aligned with the
    coordinate reference. The best scoring alignment is used for the
    conversion.

    :param projects: project configuration used to look up the references
        and the seed names
    :param coord_name: name of the coordinate reference
    :param start_pos: one-based position of the first amino acid in the range
    :param end_pos: one-based position of the last amino acid in the range
    :return: (ref_name, ref_start, ref_end) the name of the best matching
        seed, the nucleotide offset where the start codon begins, and the
        nucleotide offset just past the end codon. An offset is None when
        its position was never reached in the alignment.
    :raise TypeError: when no alignment scores above zero, because there is
        no best match to unpack
    """
    coord_seq = projects.getReference(coord_name)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    for ref_name in sorted(projects.getProjectSeeds('HCV')):
        if not ref_name.startswith('HCV-2'):
            continue
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            # Only compare positions on columns that hold a coordinate amino.
            # On a gap column coord_pos is unchanged, so comparing there
            # would overwrite the offsets with those of an inserted codon.
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    return ref_name, ref_start, ref_end
Exemple #15
0
def check_hiv_wild_types(project_config):
    """ Compare the HIV wild type sequences with the Consensus B source.

    The PR, RT, and INT regions are sliced out of the fetched CONSENSUS_B
    sequence, translated, and compared with the project configuration. The
    contents of the wild types file are passed along as reference overrides.

    :param project_config: the project configuration to check
    :return: the error count reported by compare_config()
    """
    print("""\
HIV wild types for resistance reports are extracted from Consensus B.
""")
    alignment = fetch_alignment_sequences(2004,
                                          'CON',  # Consensus/Ancestral
                                          'POL')
    consensus_b = alignment['CONSENSUS_B'].upper()

    with open(WILD_TYPES_PATH) as wild_types_file:
        wild_types = safe_load(wild_types_file)

    # Slice boundaries of each region within the consensus sequence.
    boundaries = {'PR': (171, 468),
                  'RT': (468, 1788),
                  'INT': (2148, 3014)}
    ref_names = sorted(boundaries)
    source_wild_types = {
        ref_name: translate(consensus_b[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_wild_types,
                                         reference_overrides=wild_types)
    print(report)
    return error_count
Exemple #16
0
    def testThreeDashes(self):
        """ A codon made entirely of dashes is translated to a dash. """
        codon = '---'

        translated = translate(codon)

        self.assertEqual('-', translated)
Exemple #17
0
    def load_reading_frames(self, seed_name):
        """ Calculate reading frames along a consensus sequence.

        The consensus is translated in each of the three reading frames and
        aligned with every coordinate reference of the seed. For each
        coordinate reference, the best scoring frame is recorded against the
        consensus positions that align with the reference.

        :param seed_name: the name of the seed to look up
        :return: {pos: frame} zero-based position and reading frame for each
            position. Frame 1 needs one nucleotide inserted at start.
            The result is a Counter, so unrecorded positions read as frame 0.
        """
        result = Counter()
        conseq = self.remap_conseqs[seed_name]
        coord_refs = self.projects.getCoordinateReferences(seed_name)
        for coord_ref in coord_refs.values():
            # Start below any real score, so the first alignment always wins.
            best_alignment = (-1000000, '', '', 0)
            for frame_index in range(3):
                # Pad the start with dashes to shift the reading frame.
                conseq_aminos = translate('-'*frame_index + conseq)
                aconseq, acoord, score = self._pair_align(conseq_aminos,
                                                          coord_ref,
                                                          GAP_OPEN_COORD,
                                                          GAP_EXTEND_COORD)
                best_alignment = max(best_alignment,
                                     (score, aconseq, acoord, frame_index))
            _, aconseq, acoord, frame_index = best_alignment
            if frame_index == 0:
                continue  # defaults to 0, no need to record
            conseq_codon_index = -1
            for conseq_amino, coord_amino in zip(aconseq, acoord):
                if conseq_amino != '-':
                    conseq_codon_index += 1
                if coord_amino == '-':
                    # Not aligned with the coordinate reference: don't record.
                    continue

                # Undo the padding to get back to a consensus position.
                nuc_pos = conseq_codon_index * 3 - frame_index
                for i in range(3):
                    result[nuc_pos+i] = frame_index
        return result
Exemple #18
0
    def testSingleDashUnambiguous(self):
        """ A dash is translated when every base gives the same amino. """
        codon = 'CG-'  # CGA, CGC, CGG, CGT all map to R

        translated = translate(codon)

        self.assertEqual('R', translated)
Exemple #19
0
    def testLowerCase(self):
        """ Lower case nucleotides translate the same as upper case. """
        mixed_case_codons = 'TttCCT'

        translated = translate(mixed_case_codons)

        self.assertEqual('FP', translated)
Exemple #20
0
def extract_target(seed_ref, coordinate_ref):
    """ Cut out the part of a seed that lines up with a coordinate reference.

    The seed is translated in all three reading frames, and the frame with
    the best alignment score against the coordinate reference is used.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    """
    # The placeholder scores below anything real, so a real alignment wins.
    alignments = [(-1000000, '', '', 0)]
    for frame in range(3):
        shifted_aminos = translate('-'*frame + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            shifted_aminos,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        alignments.append((frame_score, aligned_seed, aligned_coord, frame))
    score, aseed, acoord, frame_index = max(alignments)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    codon_end = -frame_index  # compensates for the dashes added at the start
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino != '-':
            codon_end += 3
            if coord_amino != '-':
                codons.append(seed_ref[codon_end-3:codon_end])
    return ''.join(codons)
Exemple #21
0
    def testSingleCodon(self):
        """ One complete codon is translated into one amino acid. """
        codon = 'TTT'

        translated = translate(codon)

        self.assertEqual('F', translated)
Exemple #22
0
    def testOffset(self):
        """ An offset of three puts one dash in front of the aminos. """
        codons = "TTTCCT"
        nuc_offset = 3

        translated = translate(codons, nuc_offset)

        self.assertEqual("-FP", translated)
Exemple #23
0
def check_hcv_coordinates(project_config, unchecked_ref_names: set):
    """ Compare the HCV coordinate references with their source sequences.

    Each genotype's source sequence is fetched by accession number, cut at
    the gene boundaries listed below, and translated. The translations are
    then compared with the project configuration.

    :param project_config: the project configuration to check
    :param unchecked_ref_names: names of references that haven't been
        checked yet. The HCV coordinate regions get removed from this set.
    :return: the error count reported by compare_config()
    """
    print("""\
Most HCV coordinate references were listed in the FDA guidance:
https://www.fda.gov/downloads/Drugs/GuidanceComplianceRegulatoryInformation/Guidances/UCM340712.pdf
This script contains a complete list of the reference accession numbers.
""")
    accession_numbers = dict(HCV1A='NC_004102',
                             HCV1B='AJ238799',
                             HCV2='AB047639',
                             HCV3='GU814263',
                             HCV4='GU814265',
                             HCV5='AF064490',
                             HCV6='Y12083',
                             HCV7='EF108306')
    source_nuc_sequences = {}
    for genotype, accession_number in accession_numbers.items():
        source_nuc_sequences[genotype] = fetch_hcv_by_accession(
            accession_number)

    gene_names = ['Core',
                  'E1',
                  'E2',
                  'p7',
                  'NS2',
                  'NS3',
                  'NS4a',
                  'NS4b',
                  'NS5a',
                  'NS5b']
    # Boundary positions are from the European HCV database records.
    # https://euhcvdb.ibcp.fr/euHCVdb/do/displayHCVEntry?primaryAC=AF009606
    # That is the original H77 accession number for HCV1A. NC_004102 is the
    # curated and annotated version that was derived from the AF009606 entry.
    # All the other genotypes can be found by their regular accession numbers.

    # Each gene runs from its own boundary up to the next boundary in the
    # list, so there is one more boundary than there are genes.
    genotype_boundaries = {
        #         Core E1   E2    p7    NS2   NS3   NS4a  NS4b  NS5a  NS5b
        'HCV1A': [342, 915, 1491, 2580, 2769, 3420, 5313, 5475, 6258, 7602, 9375],
        'HCV1B': [342, 915, 1491, 2580, 2769, 3420, 5313, 5475, 6258, 7599, 9372],
        'HCV2': [341, 914, 1490, 2591, 2780, 3431, 5324, 5486, 6269, 7667, 9440],
        'HCV3': [340, 913, 1489, 2596, 2785, 3436, 5329, 5491, 6274, 7630, 9403],
        'HCV4': [341, 914, 1490, 2579, 2768, 3419, 5312, 5474, 6257, 7592, 9365],
        'HCV5': [247, 820, 1396, 2488, 2677, 3328, 5221, 5383, 6166, 7516, 9289],
        'HCV6': [284, 857, 1433, 2534, 2723, 3374, 5267, 5429, 6212, 7565, 9338],
        'HCV7': [309, 882, 1458, 2559, 2748, 3399, 5292, 5454, 6237, 7575, 9348]}

    hcv_project = project_config.config['projects']['HCV']
    ref_names = set()
    for project_region in hcv_project['regions']:
        ref_names.add(project_region['coordinate_region'])
    unchecked_ref_names.difference_update(ref_names)

    source_sequences = {}
    for ref_name in sorted(ref_names):
        # Names start with the genotype and end with the gene name.
        genotype = ref_name.partition('-')[0]
        gene_name = ref_name.rpartition('-')[2]
        gene_index = gene_names.index(gene_name)
        start, stop = genotype_boundaries[genotype][gene_index:gene_index + 2]
        # Boundaries are one-based, so shift them for slicing.
        gene_nucs = source_nuc_sequences[genotype][start - 1:stop - 1]
        source_sequences[ref_name] = translate(gene_nucs)

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
Exemple #24
0
    def testReturnListWithoutMixtures(self):
        """ An odd combination of options should still behave sensibly. """

        codons = 'CGATTM'  # TTA or TTC: map to L or F
        expected_lists = [['R'], ['?']]

        amino_lists = translate(codons,
                                return_list=True,
                                translate_mixtures=False)

        self.assertEqual(expected_lists, amino_lists)
Exemple #25
0
    def testStatisticsUnambiguous(self):
        """ Statistics for a sequence without any mixtures. """
        codons = 'TTATTCTTTTTA'
        collected_stats = {}
        expected_stats = {'length': 4, 'ambiguous': 0, 'max_aminos': 1}

        translated = translate(codons,
                               stats=collected_stats,
                               list_ambiguous=True)

        self.assertEqual('LFFL', translated)
        self.assertEqual(expected_stats, collected_stats)
Exemple #26
0
    def testStatisticsBlank(self):
        """ Statistics for an empty sequence are all zero. """
        codons = ''
        collected_stats = {}
        expected_stats = {'length': 0, 'ambiguous': 0, 'max_aminos': 0}

        translated = translate(codons,
                               stats=collected_stats,
                               list_ambiguous=True)

        self.assertEqual('', translated)
        self.assertEqual(expected_stats, collected_stats)
Exemple #27
0
    def testStatisticsAmbiguous(self):
        """ Statistics count the ambiguous codons and the longest list. """
        codons = 'TTMTTCNTTTTA'
        collected_stats = {}
        expected_stats = {'length': 4, 'ambiguous': 2, 'max_aminos': 4}

        translated = translate(codons,
                               stats=collected_stats,
                               list_ambiguous=True)

        self.assertEqual('[FL]F[FILV]L', translated)
        self.assertEqual(expected_stats, collected_stats)
Exemple #28
0
 def count_aminos(self, codon_seq, count):
     """ Add the reads of one codon to the counts for this position.

     The amino acid is only counted when the translation is in
     AMINO_ALPHABET, but the three nucleotides are always counted.
     @param codon_seq: a string of three nucleotides that were read at this
                       position
     @param count: the number of times they were read
     """
     translated = translate(codon_seq.upper())
     if translated in AMINO_ALPHABET:
         self.counts[translated] += count
     for position in (0, 1, 2):
         seed_nucleotide = self.nucleotides[position]
         seed_nucleotide.count_nucleotides(codon_seq[position], count)
Exemple #29
0
def check_hcv_coordinates(project_config, unchecked_ref_names: set):
    """ Compare the HCV coordinate references with their source sequences.

    Each genotype's source sequence is fetched by accession number, cut at
    the gene positions found by the landmark reader, and translated. The
    translations are then compared with the project configuration.

    :param project_config: the project configuration to check
    :param unchecked_ref_names: names of references that haven't been
        checked yet. The HCV coordinate regions get removed from this set.
    :return: the error count reported by compare_config()
    """
    print("""\
Most HCV coordinate references were listed in the FDA guidance:
https://www.fda.gov/downloads/Drugs/GuidanceComplianceRegulatoryInformation/Guidances/UCM340712.pdf
This script contains a complete list of the reference accession numbers.
""")
    accession_numbers = dict(
        HCV1A='NC_004102',
        HCV1B='AJ238799',
        HCV2='AB047639',
        HCV3='GU814263',
        HCV4='GU814265',
        HCV5='AF064490',
        HCV6='Y12083',
        # EF108306.2 is available, but only extends 5' and 3'.
        HCV7='EF108306.1')
    source_nuc_sequences = {}
    for genotype, accession_number in accession_numbers.items():
        source_nuc_sequences[genotype] = fetch_by_accession(accession_number)

    # Boundary positions in landmarks are from the European HCV database records.
    # https://euhcvdb.ibcp.fr/euHCVdb/do/displayHCVEntry?primaryAC=AF009606
    # That is the original H77 accession number for HCV1A. NC_004102 is the
    # curated and annotated version that was derived from the AF009606 entry.
    # All the other genotypes can be found by their regular accession numbers.

    hcv_project = project_config.config['projects']['HCV']
    ref_names = set()
    for project_region in hcv_project['regions']:
        ref_names.add(project_region['coordinate_region'])
    unchecked_ref_names.difference_update(ref_names)
    landmark_reader = LandmarkReader.load()

    source_sequences = {}
    for ref_name in sorted(ref_names):
        genotype = ref_name.partition('-')[0]
        # Drop the three-letter prefix of the genotype to build the seed
        # name, and add subtype 'a' when only one character is left.
        subtype = genotype[3:].lower()
        seed_name = 'HCV-' + subtype
        if len(subtype) == 1:
            seed_name += 'a'

        region = landmark_reader.get_gene(seed_name, ref_name)
        start = region['start']
        stop = region['end']
        gene_nucs = source_nuc_sequences[genotype][start - 1:stop - 1]
        source_sequences[ref_name] = translate(gene_nucs)

    report, error_count = compare_config(ref_names, project_config,
                                         source_sequences)
    print(report)
    return error_count
Exemple #30
0
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    """ Convert a range of coordinate positions to nucleotide offsets in a seed.

    A nucleotide coordinate region needs no conversion, so its name and the
    requested positions are returned unchanged. Otherwise, every seed region
    that is configured for the coordinate region is translated with
    nucleotide offsets 0, 1, and 2, then aligned with the coordinate
    reference. The best scoring alignment is used for the conversion.

    :param projects: project configuration used to look up the references
    :param coord_name: name of the coordinate region
    :param start_pos: one-based start position, defaults to 1
    :param end_pos: end position, defaults to one more than the length of
        the coordinate reference
    :return: (ref_name, ref_start, ref_end) the chosen reference name and the
        converted start and end
    """
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        # NOTE(review): on the amino acid path below, coord_pos looks like it
        # can't exceed len(coord_seq), so this default would never match and
        # the ref_end assertion would fail - confirm the intended default.
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos

    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1

    # Collect every seed region that maps to this coordinate region.
    seed_names = {
        seed_region_name
        for project in projects.config['projects'].values()
        for region in project['regions']
        if coord_name == region['coordinate_region']
        for seed_region_name in region['seed_region_names']}

    top_score = 0
    best_match = None
    for ref_name in sorted(seed_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > top_score:
                top_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match

    coord_pos = 0
    ref_pos = 0
    ref_start = None
    ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # Insertion relative to the coordinate reference: no position.
            continue
        coord_pos += 1
        codon_end = ref_pos * 3 - nuc_offset
        if coord_pos == start_pos:
            ref_start = codon_end - 3
        if coord_pos == end_pos:
            ref_end = codon_end
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
Exemple #31
0
 def count_aminos(self, codon_seq, count):
     """ Add the reads of one codon to the counts for this position.

     A codon is counted as low quality, a deletion, a partial deletion, or
     a translated amino acid. Codons padded with spaces or holding a lower
     case n are in none of those groups. Every nucleotide that isn't a
     space is also passed on to the nucleotide counts.
     @param codon_seq: a string of three nucleotides that were read at this
                       position, may be padded with spaces at the start
                       or end of a sequence, or dashes for deletions
     @param count: the number of times they were read
     """
     if 'N' in codon_seq:
         self.low_quality += count
     elif codon_seq == '---':
         self.deletions += count
     elif '-' in codon_seq:
         self.partial += count
     elif not (' ' in codon_seq or 'n' in codon_seq):
         self.counts[translate(codon_seq.upper())] += count

     for position, base in enumerate(codon_seq):
         if base == ' ':
             continue
         self.nucleotides[position].count_nucleotides(base, count)
Exemple #32
0
def check_hla_coordinates(project_config, unchecked_ref_names: set):
    """ Compare the HLA coordinate references with the translated seed.

    The position of each exon within HLA-B-seed comes from the landmark
    reader. That part of the seed is translated and compared with the
    project configuration.

    :param project_config: the project configuration to check
    :param unchecked_ref_names: names of references that haven't been
        checked yet. The HLA exon references get removed from this set.
    :return: the error count reported by compare_config()
    """
    print("""\
HLA coordinate references are translated from the seed reference.
""")
    ref_names = ('HLA-B-exon2', 'HLA-B-exon3')

    seed_sequence = project_config.getReference('HLA-B-seed')
    unchecked_ref_names.difference_update(ref_names)
    landmark_reader = LandmarkReader.load()

    source_sequences = {}
    for ref_name in ref_names:
        exon_name = ref_name[6:]  # the part after 'HLA-B-'
        region = landmark_reader.get_gene('HLA-B-seed', exon_name)
        exon_nucs = seed_sequence[region['start']:region['end']]
        source_sequences[ref_name] = translate(exon_nucs)

    report, error_count = compare_config(ref_names, project_config,
                                         source_sequences)
    print(report)
    return error_count
Exemple #33
0
 def count_aminos(self, codon_seq, count):
     """ Add the reads of one codon to the counts for this position.

     A codon is counted as low quality, a deletion, a partial deletion, or
     a translated amino acid. Codons padded with spaces or holding a lower
     case n are in none of those groups. Every nucleotide that isn't a
     space is also passed on to the nucleotide counts.
     @param codon_seq: a string of three nucleotides that were read at this
                       position, may be padded with spaces at the start
                       or end of a sequence, or dashes for deletions
     @param count: the number of times they were read
     """
     if 'N' in codon_seq:
         self.low_quality += count
     elif codon_seq == '---':
         self.deletions += count
     elif '-' in codon_seq:
         self.partial += count
     elif not (' ' in codon_seq or 'n' in codon_seq):
         self.counts[translate(codon_seq.upper())] += count

     for position, base in enumerate(codon_seq):
         if base == ' ':
             continue
         self.nucleotides[position].count_nucleotides(base, count)
Exemple #34
0
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Convert a range of coordinate positions to nucleotide offsets in a seed.

    Every seed region that is configured for the coordinate region is
    translated with nucleotide offsets 0, 1, and 2, then aligned with the
    coordinate reference. The best scoring alignment is used for the
    conversion.

    :param projects: project configuration used to look up the references
    :param coord_name: name of the coordinate region
    :param start_pos: one-based position of the first amino acid in the range
    :param end_pos: one-based position of the last amino acid in the range
    :return: (ref_name, ref_start, ref_end) the name of the best matching
        seed region, the nucleotide offset where the start codon begins, and
        the nucleotide offset just past the end codon
    """
    coord_seq = projects.getReference(coord_name)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1

    # Collect every seed region that maps to this coordinate region.
    seed_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if region['coordinate_region'] == coord_name:
                seed_names.update(region['seed_region_names'])

    top_score = 0
    best_match = None
    for ref_name in sorted(seed_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > top_score:
                top_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match

    coord_pos = 0
    ref_pos = 0
    ref_start = None
    ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # Insertion relative to the coordinate reference: no position.
            continue
        coord_pos += 1
        codon_end = ref_pos * 3 - nuc_offset
        if coord_pos == start_pos:
            ref_start = codon_end - 3
        if coord_pos == end_pos:
            ref_end = codon_end
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
Exemple #35
0
def check_hla_coordinates(project_config, unchecked_ref_names: set):
    """ Compare the HLA coordinate references with the translated seed.

    Each exon is sliced out of HLA-B-seed at the boundaries listed below,
    translated, and compared with the project configuration.

    :param project_config: the project configuration to check
    :param unchecked_ref_names: names of references that haven't been
        checked yet. The HLA exon references get removed from this set.
    :return: the error count reported by compare_config()
    """
    print("""\
HLA coordinate references are translated from the seed reference.
""")
    # Slice boundaries of each exon within the seed sequence.
    boundaries = {'HLA-B-exon2': (200, 470),
                  'HLA-B-exon3': (716, 992)}

    seed_sequence = project_config.getReference('HLA-B-seed')
    ref_names = sorted(boundaries)
    unchecked_ref_names.difference_update(ref_names)

    source_sequences = {
        ref_name: translate(seed_sequence[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_sequences)
    print(report)
    return error_count
Exemple #36
0
def check_sars_coordinates(project_config, unchecked_ref_names: set):
    """ Compare the SARS-CoV-2 coordinate references with the translated seed.

    The position of each gene within SARS-CoV-2-seed comes from the landmark
    reader. That part of the seed is translated, its translated length is
    printed, and the translations are compared with the project
    configuration.

    :param project_config: the project configuration to check
    :param unchecked_ref_names: names of references that haven't been
        checked yet. The SARS-CoV-2 references get removed from this set.
    :return: the error count reported by compare_config()
    """
    print("""\
SARS-CoV-2 coordinate references are translated from the seed reference.
""")
    ref_names = (
        'SARS-CoV-2-ORF1ab',
        'SARS-CoV-2-S',
        'SARS-CoV-2-ORF3a',
        'SARS-CoV-2-E',
        'SARS-CoV-2-M',
        'SARS-CoV-2-ORF6',
        'SARS-CoV-2-ORF7a',
        'SARS-CoV-2-ORF7b',
        'SARS-CoV-2-ORF8',
        'SARS-CoV-2-N',
        'SARS-CoV-2-ORF10',
        'SARS-CoV-2-nsp1',
        'SARS-CoV-2-nsp2',
        'SARS-CoV-2-nsp3',
        'SARS-CoV-2-nsp4',
        'SARS-CoV-2-nsp5',
        'SARS-CoV-2-nsp6',
        'SARS-CoV-2-nsp7',
        'SARS-CoV-2-nsp8',
        'SARS-CoV-2-nsp9',
        'SARS-CoV-2-nsp10',
        'SARS-CoV-2-nsp12',
        'SARS-CoV-2-nsp13',
        'SARS-CoV-2-nsp14',
        'SARS-CoV-2-nsp15',
        'SARS-CoV-2-nsp16')

    # Funky translation at this base: it gets duplicated.
    duplicated_base = 13468
    seed_sequence = project_config.getReference('SARS-CoV-2-seed')
    unchecked_ref_names.difference_update(ref_names)
    landmark_reader = LandmarkReader.load()

    source_sequences = {}
    for ref_name in ref_names:
        region = landmark_reader.get_gene('SARS-CoV-2-seed', ref_name)
        start, end = region['start'], region['end']
        gene_nucs = seed_sequence[start - 1:end - 3]  # Trim stop codons.
        if start <= duplicated_base <= end:
            # Repeat the one base by overlapping the two slices.
            split_at = duplicated_base - start
            gene_nucs = gene_nucs[:split_at + 1] + gene_nucs[split_at:]
        gene_aminos = translate(gene_nucs)
        source_sequences[ref_name] = gene_aminos
        print(ref_name, len(gene_aminos))

    report, error_count = compare_config(ref_names, project_config,
                                         source_sequences)
    print(report)
    return error_count
Exemple #37
0
def check_hiv_wild_types(project_config):
    """ Compare the HIV wild type sequences with the Consensus B source.

    The PR, RT, and INT regions are sliced out of the fetched CONSENSUS_B
    sequence, translated, and compared with the project configuration. The
    contents of the wild types file are passed along as reference overrides.

    :param project_config: the project configuration to check
    :return: the error count reported by compare_config()
    """
    print("""\
HIV wild types for resistance reports are extracted from Consensus B.
""")
    alignment = fetch_alignment_sequences(2004,
                                          'CON',  # Consensus/Ancestral
                                          'POL')
    consensus_b = alignment['CONSENSUS_B'].upper()

    with open(WILD_TYPES_PATH) as wild_types_file:
        wild_types = safe_load(wild_types_file)

    # Slice boundaries of each region within the consensus sequence.
    boundaries = {
        'PR': (171, 468),
        'RT': (468, 1788),
        'INT': (2148, 3014)}
    ref_names = sorted(boundaries)
    source_wild_types = {
        ref_name: translate(consensus_b[start:end])
        for ref_name, (start, end) in boundaries.items()}

    report, error_count = compare_config(ref_names,
                                         project_config,
                                         source_wild_types,
                                         reference_overrides=wild_types)
    print(report)
    return error_count
Exemple #38
0
    def write(self, inserts, region, report_aminos=None):
        """ Report the insertions found in the current group of reads.

        The inserted sequences are cut out of the reads that were added to
        the current group, translated, and counted. One row is written for
        each distinct inserted amino acid sequence.
        @param inserts: indexes of positions in the reads that should be
            reported as insertions.
        @param region: the name of the coordinate region the current group was
            mapped to
        @param report_aminos: a list of ReportAmino objects that represent the
            sequence that successfully mapped to the coordinate reference.
        """
        if len(inserts) == 0:
            return

        if not report_aminos:
            report_aminos = []

        region_insert_pos_counts = self.insert_pos_counts[(self.seed, region)]

        # Merge neighbouring codon positions into [start, end] pairs.
        insert_ranges = []
        for insert in sorted(inserts):
            if insert_ranges and insert == insert_ranges[-1][1]:
                # Continues straight on from the previous range.
                insert_ranges[-1][1] += 3
            else:
                # First range, or there was a gap since the previous one.
                insert_ranges.append([insert, insert + 3])

        # Count the amino acid variants that were read in each range.
        insert_counts = OrderedDict()  # {left: {insert_seq: count}}
        insert_targets = {}  # {left: inserted_before_pos}
        for left, right in insert_ranges:
            for report_amino in report_aminos:
                if report_amino.seed_amino.consensus_nuc_index == right:
                    insert_targets[left] = report_amino.position
                    break
            variant_counts = insert_counts[left] = Counter()
            for nuc_seq, count in self.nuc_seqs.items():
                insert_nuc_seq = nuc_seq[left:right]
                if (not insert_nuc_seq or
                        'n' in insert_nuc_seq or
                        '-' in insert_nuc_seq):
                    continue
                insert_amino_seq = translate(insert_nuc_seq)
                if insert_amino_seq:
                    variant_counts[insert_amino_seq] += count

        # Write the rows, most common variants first.
        for left, counts in insert_counts.items():
            insert_before = insert_targets.get(left)
            # Only care about insertions in the middle of the sequence,
            # so ignore any that come before or after the reference.
            # Also report if we're in test mode (no report_aminos).
            if report_aminos and insert_before in (1, None):
                continue
            for insert_seq, count in counts.most_common():
                self.insert_writer.writerow({'seed': self.seed,
                                             'region': region,
                                             'qcut': self.qcut,
                                             'left': left + 1,
                                             'insert': insert_seq,
                                             'count': count,
                                             'before': insert_before})
                if insert_before is not None:
                    region_insert_pos_counts[insert_before - 1] += count
Exemple #39
0
    def align_aminos(self, seq, gapIns=3, removeinserts=False, qachecks=False):
        """
        Align amino acids to a standard reference using gotoh.cpp

        :param seq:  either a list of amino acid lists, one list for each
            position, or a codon sequence that will be translated. A list
            gets modified in place.
        :param gapIns:  gap penalty that is passed on to the aligner
        :param removeinserts:  Whether to remove insertions relative to standard
        :param qachecks:  Whether to reject sequences that fail the quality
            checks. The length and dash checks are written for a codon
            sequence.
        :return: (aa_lists, indels) the aligned list of amino acid lists, and
            whether any insertions were removed. Failures are reported as
            (-1, None) for a failed quality check, or (-2, None) for
            insertions relative to the standard that weren't removed.
        """
        std = self.std_v3

        if qachecks:
            if seq is None:
                return -1, None
            if (len(seq) % 3 != 0) or len(seq) < 96:
                return -1, None
            if seq.startswith('----') or seq.endswith('----'):
                return -1, None

        if isinstance(seq, list):
            aa_lists = seq  # aa_seq in pssm_lib.rb
        else:
            # assume this is a codon sequence
            aa_lists = translate(seq=seq,
                                 offset=0,
                                 resolve=False,
                                 return_list=True,
                                 ambig_char='X')

        for i, aa_list in enumerate(aa_lists):
            for j, aa in enumerate(aa_list):
                if aa == 'X':
                    aa_list[j] = '-'
            if len(aa_list) > 1 and '*' in aa_list:
                # Drop stop codons from mixtures, but keep the alternatives.
                aa_lists[i] = [aa for aa in aa_list if aa != '*']

        while ['-'] in aa_lists:
            aa_lists.remove(['-'])

        # resolve into string, using the first amino acid at each position
        aa_seq = ''.join(aa_list[0] for aa_list in aa_lists)  # aa_seq_s in pssm_lib.rb

        if qachecks:
            if any('*' in aa_list for aa_list in aa_lists):
                return -1, None

        std = std.replace('-', 'X')  # fix gaps in reference
        aligned_std, aligned_seq = gotoh.align_it_aa_rb(std, aa_seq, gapIns, 1)  # method_recall
        aligned_std = aligned_std.replace('X', '-')
        std = std.replace('X', '-')  # restore original state

        # apply alignment to lists
        for i, aligned_amino in enumerate(aligned_seq):
            if aligned_amino == '-':
                aa_lists.insert(i, ['-'])  # insert before index, like Ruby

        indels = False
        if removeinserts and '-' in aligned_std:
            # Build the new list front to back. Walking backwards is only
            # needed when deleting in place, and it would leave the result
            # in reverse order here.
            new_aa_lists = []
            for i, std_amino in enumerate(aligned_std):
                if std_amino == '-':
                    # skip positions that are insertions relative to standard
                    indels = True
                    continue
                new_aa_lists.append(aa_lists[i])
            aa_lists = new_aa_lists
        else:
            if aligned_std != std:
                # reject sequences with insertions relative to standard
                return -2, None

        return aa_lists, indels