def write_cyst_file(known_cyst_positions):
    unaligned_genes = utils.read_germlines(args.dirname, only_region='v')['v']
    aligned_genes = utils.read_germlines(args.dirname, only_region='v', aligned=True)['v']

    common_gene = None  # we need to find at least one gene that's in the old and the new sets, so we know how to convert cyst positions
    for gene, info in known_cyst_positions.items():
        if gene in aligned_genes:
            common_gene = gene
            break
    if common_gene is None:
        raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname))

    aligned_seq = aligned_genes[common_gene]
    seq = unaligned_genes[common_gene]
    cpos = known_cyst_positions[common_gene]['cysteine-position']
    utils.check_conserved_cysteine(seq, cpos)
    cpos_in_alignment = cpos
    ipos = 0  # position in unaligned sequence
    n_dots_passed = 0  # number of gapped positions in the aligned sequences that we pass before getting to cpos (i.e. while ipos < cpos)
    while ipos < cpos:
        if aligned_seq[ipos + n_dots_passed] in utils.gap_chars:
            cpos_in_alignment += 1
            n_dots_passed += 1
        else:
            ipos += 1
    utils.check_conserved_cysteine(aligned_seq, cpos_in_alignment)
    displacement = cpos_in_alignment - cpos
    print '  cpos displacement: %d' % displacement
    cyst_positions = []
    for gene, seq in unaligned_genes.items():
        
        cyst_positions.append({'gene' : gene, 'cyst_start' : cpos})
Exemple #2
0
def generate_snpd_gene(gene, cpos, seq, positions):
    assert utils.get_region(gene) == 'v'  # others not yet handled
    def choose_position():
        snp_pos = None
        while snp_pos is None or snp_pos in snpd_positions or not utils.check_conserved_cysteine(tmpseq, cpos, debug=True, assert_on_fail=False):
            snp_pos = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
            tmpseq = seq[: snp_pos] + 'X' + seq[snp_pos + 1 :]  # for checking cyst position
        return snp_pos

    snpd_positions = set()  # only used if a position wasn't specified (i.e. was None) in <snps_to_add>
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)
        new_base = None
        while new_base is None or new_base == seq[snp_pos]:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
        print '        %3d   %s --> %s' % (snp_pos, seq[snp_pos], new_base)
        mutfo[snp_pos] = {'original' : seq[snp_pos], 'new' : new_base}

        seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :]

    utils.check_conserved_cysteine(seq, cpos)
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
def write_cyst_file(known_cyst_positions):
    unaligned_genes = utils.read_germline_seqs(args.dirname, only_region='v')['v']
    aligned_genes = utils.read_germline_seqs(args.dirname, only_region='v', aligned=True)['v']

    known_gene = None  # we need to find at least one gene that's in the old and the new sets, so we know how to convert cyst positions
    for gene, info in known_cyst_positions.items():
        if gene in aligned_genes:
            known_gene = gene
            break
    if known_gene is None:
        raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname))

    known_cpos = known_cyst_positions[known_gene]
    cpos_in_alignment = get_cpos_in_alignment(aligned_genes[known_gene], unaligned_genes[known_gene], known_cpos)
    cyst_positions = {}
    errors = []
    for gene, seq in unaligned_genes.items():
        unaligned_cpos = cpos_in_alignment - utils.count_gaps(aligned_genes[gene], istop=cpos_in_alignment)
        try:
            utils.check_conserved_cysteine(seq, unaligned_cpos, debug=True)
        except:
            print '  %s cysteine not found in %s, skipping' % (utils.color('red', 'warning'), gene)
            # print gene, unaligned_cpos
            # print seq
            # print aligned_genes[gene]
            errors.append(gene)
            continue
        cyst_positions[gene] = unaligned_cpos

    with open(args.dirname + '/' + cyst_fname, 'w') as cystfile:
        writer = csv.DictWriter(cystfile, ('gene', 'istart'))
        writer.writeheader()
        for gene, cpos in cyst_positions.items():
            writer.writerow({'gene' : gene, 'istart' : cpos})
    with open(args.dirname + '/' + error_fname, 'w') as errorfile:
        for gene in errors:
            errorfile.write('%s\n' % gene)
Exemple #4
0
    def process_query(self, bam, reads):
        """ Collect the matches in <reads> for a single query sequence and hand them to self.summarize_query().

        <reads> are the alignment records for one query; <bam> is only used to look up each record's
        gene name (bam.references[read.tid]). For every record we work out the gene, region, score
        (read.tags[0][1], multiplied by the gene choice prob if self.args.apply_choice_probs_in_sw is
        set) and the query and germline bounds, then decide whether to keep it:
          - v matches whose conserved codon position is negative or past the end of the query are skipped
          - a match with an insertion or deletion in its cigar string is skipped, unless it's the first
            match for its region: in that case the indel info is stored in self.info['indels'] (if there
            isn't already an entry for this query) and we return *without* calling summarize_query()

        Raises Exception if a v gene's stored cysteine position fails the check, or if the query and
        germline matches have different lengths.
        """
        # NOTE(review): <primary> is None if every read is secondary, in which case the next line raises AttributeError
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        query_name = primary.qname
        first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        n_skipped_invalid_cpos = 0  # only used by the commented-out print at the bottom
        for read in reads:  # loop over the matches found for each query sequence
            # set this match's values
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            raw_score = read.tags[0][1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)
            if region == 'v' and first_match_query_bounds is None:
                first_match_query_bounds = qrbounds

            # perform a few checks and see if we want to skip this match
            if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
                cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
                if not utils.check_conserved_cysteine(self.germline_seqs['v'][gene], self.cyst_positions[gene]['cysteine-position'], assert_on_fail=False):  # some of the damn cysteine positions in the json file were wrong, so now we check
                    raise Exception('bad cysteine in %s: %d %s' % (gene, self.cyst_positions[gene]['cysteine-position'], self.germline_seqs['v'][gene]))
                if cpos < 0 or cpos >= len(query_seq):
                    n_skipped_invalid_cpos += 1
                    continue

            if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
                if len(all_match_names[region]) == 0:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                    if query_name not in self.info['indels']:
                        self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0] : qrbounds[1]], self.germline_seqs[region][gene][glbounds[0] : glbounds[1]], gene)
                        # put the parts of the query on either side of the match back around the returned 'reversed_seq'
                        self.info['indels'][query_name]['reversed_seq'] = query_seq[ : qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1] : ]
                        self.new_indels += 1
                        # print ' query seq  %s' % query_seq
                        # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                        # self.info['skipped_indel_queries'].append(query_name)
                        # self.info[query_name] = {'indels'}
                    else:
                        print '     multiple indels for %s' % query_name
                    return  # NOTE we give up on this query here, i.e. summarize_query() is not called
                else:
                    continue

            if qrbounds[1]-qrbounds[0] != glbounds[1]-glbounds[0]:
                raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):  # print some debug info before the assertion below fails
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[1]-qrbounds[0] == glbounds[1]-glbounds[0]

            # and finally add this match's information
            warnings[gene] = ''
            all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        # if n_skipped_invalid_cpos > 0:
        #     print '      skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
        self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)
    def process_query(self, bam, reads):
        """ Collect the matches in <reads> for a single query sequence and hand them to self.summarize_query().

        <reads> are the alignment records for one query; <bam> is only used to look up each record's
        gene name (bam.references[read.tid]). For every record we work out the gene, region, score
        (read.tags[0][1], multiplied by the gene choice prob if self.args.apply_choice_probs_in_sw is
        set) and the query and germline bounds, then decide whether to keep it:
          - v matches whose conserved codon position is negative or past the end of the query are skipped
          - a match with an insertion or deletion in its cigar string is skipped, unless it's the first
            match for its region: in that case the indel info is stored in self.info['indels'] (if there
            isn't already an entry for this query) and we return *without* calling summarize_query()

        Raises Exception if a v gene's stored cysteine position fails the check, or if the query and
        germline matches have different lengths.
        """
        # NOTE(review): <primary> is None if every read is secondary, in which case the next line raises AttributeError
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        query_name = primary.qname
        first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        n_skipped_invalid_cpos = 0  # only used by the commented-out print at the bottom
        for read in reads:  # loop over the matches found for each query sequence
            # set this match's values
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            raw_score = read.tags[0][
                1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(
                    region, gene
                ) * raw_score  # multiply by the probability to choose this gene
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)
            if region == 'v' and first_match_query_bounds is None:
                first_match_query_bounds = qrbounds

            # perform a few checks and see if we want to skip this match
            if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
                cpos = utils.get_conserved_codon_position(self.cyst_positions,
                                                          self.tryp_positions,
                                                          'v',
                                                          gene,
                                                          glbounds,
                                                          qrbounds,
                                                          assert_on_fail=False)
                if not utils.check_conserved_cysteine(
                        self.germline_seqs['v'][gene],
                        self.cyst_positions[gene]['cysteine-position'],
                        assert_on_fail=False
                ):  # some of the damn cysteine positions in the json file were wrong, so now we check
                    raise Exception(
                        'bad cysteine in %s: %d %s' %
                        (gene, self.cyst_positions[gene]['cysteine-position'],
                         self.germline_seqs['v'][gene]))
                if cpos < 0 or cpos >= len(query_seq):
                    n_skipped_invalid_cpos += 1
                    continue

            if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
                if len(
                        all_match_names[region]
                ) == 0:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                    if query_name not in self.info['indels']:
                        self.info['indels'][query_name] = self.get_indel_info(
                            query_name, read.cigarstring,
                            query_seq[qrbounds[0]:qrbounds[1]],
                            self.germline_seqs[region][gene]
                            [glbounds[0]:glbounds[1]], gene)
                        # put the parts of the query on either side of the match back around the returned 'reversed_seq'
                        self.info['indels'][query_name][
                            'reversed_seq'] = query_seq[:qrbounds[
                                0]] + self.info['indels'][query_name][
                                    'reversed_seq'] + query_seq[qrbounds[1]:]
                        self.new_indels += 1
                        # print ' query seq  %s' % query_seq
                        # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                        # self.info['skipped_indel_queries'].append(query_name)
                        # self.info[query_name] = {'indels'}
                    else:
                        print '     multiple indels for %s' % query_name
                    return  # NOTE we give up on this query here, i.e. summarize_query() is not called
                else:
                    continue

            if qrbounds[1] - qrbounds[0] != glbounds[1] - glbounds[0]:
                raise Exception(
                    'germline match (%d %d) not same length as query match (%d %d)'
                    % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):  # print some debug info before the assertion below fails
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

            # and finally add this match's information
            warnings[gene] = ''
            all_match_names[region].append(
                (score, gene)
            )  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        # if n_skipped_invalid_cpos > 0:
        #     print '      skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
        self.summarize_query(query_name, query_seq, all_match_names,
                             all_query_bounds, all_germline_bounds, warnings,
                             first_match_query_bounds)
Exemple #6
0
    if 'N' in align_seq:
        print '\n    WARNING replacing N with A'
        align_seq = align_seq.replace('N', 'A')
    for pos in align_seq:
        if pos not in 'ACGT.':
            print 'ERROR unexpected character %s in %s from %s' % (pos, name,
                                                                   align_fname)
            sys.exit()

    # see if it's too short (WTF?!?!)
    if align_cpos >= len(align_seq):
        print 'too short!'
        bad_genes.append(name)
        continue
    try:
        utils.check_conserved_cysteine(align_seq, align_cpos, debug=True)
    except:
        bad_genes.append(name)
        continue

    # remove dots
    n_dots = align_seq.count('.')
    real_cpos = align_cpos - n_dots
    utils.check_conserved_cysteine(align_seq.replace('.', ''),
                                   real_cpos,
                                   debug=True)
    if name in cyst_positions and real_cpos == cyst_positions[name][
            'cysteine-position']:
        print 'ok'
    else:
        if name in cyst_positions and real_cpos != cyst_positions[name][
    # check for unexpected characters
    if 'N' in align_seq:
        print '\n    WARNING replacing N with A'
        align_seq = align_seq.replace('N', 'A')
    for pos in align_seq:
        if pos not in 'ACGT.':
            print 'ERROR unexpected character %s in %s from %s' % (pos, name, align_fname)
            sys.exit()

    # see if it's too short (WTF?!?!)
    if align_cpos >= len(align_seq):
        print 'too short!'
        bad_genes.append(name)
        continue
    try:
        utils.check_conserved_cysteine(align_seq, align_cpos, debug=True)
    except:
        bad_genes.append(name)
        continue

    # remove dots
    n_dots = align_seq.count('.')
    real_cpos = align_cpos - n_dots
    utils.check_conserved_cysteine(align_seq.replace('.', ''), real_cpos, debug=True)
    if name in cyst_positions and real_cpos == cyst_positions[name]['cysteine-position']:
        print 'ok'
    else:
        if name in cyst_positions and real_cpos != cyst_positions[name]['cysteine-position']:
            print 'not the same, new: %d old: %s' % (real_cpos, cyst_positions[name]['cysteine-position']),
            print '  switching to the new one'
        else:
def get_cpos_in_alignment(aligned_seq, seq, cpos):
    """ Return the index in <aligned_seq> of the cysteine that sits at <cpos> in <seq>.

    The aligned index is <cpos> plus whatever get_n_gaps_up_to_cpos() returns for <aligned_seq>.
    The cysteine is checked with utils.check_conserved_cysteine() both at <cpos> in <seq> (before
    we do anything) and at the new index in <aligned_seq> (before we return).
    """
    utils.check_conserved_cysteine(seq, cpos)  # make sure the position we were given is right to start with
    n_gaps_before_cpos = get_n_gaps_up_to_cpos(aligned_seq, cpos)
    aligned_cpos = cpos + n_gaps_before_cpos
    utils.check_conserved_cysteine(aligned_seq, aligned_cpos)
    return aligned_cpos
Exemple #9
0
 def choose_position():
     """ Draw random positions in <seq> until we get one that isn't in <snpd_positions> and doesn't break the cysteine at <cpos>.

     <seq>, <cpos> and <snpd_positions> come from the enclosing scope. A candidate is tested by
     putting an 'X' at that position and running utils.check_conserved_cysteine() on the result.
     """
     while True:
         candidate = random.randint(10, len(seq) - 15)  # note that randint() is inclusive
         if candidate in snpd_positions:  # already used, so draw again (without bothering to check the cysteine)
             continue
         tmpseq = seq[: candidate] + 'X' + seq[candidate + 1 :]  # for checking cyst position
         if utils.check_conserved_cysteine(tmpseq, cpos, debug=True, assert_on_fail=False):
             return candidate