def repl():
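    """Minimal interactive command loop for the base-counting example.

    Relies on names defined elsewhere in the surrounding module:
    `prompt`, `file`, `file_description`, `count_bases()`,
    `graph_base_count()` and `help()`.
    """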
    running = True
    while running:
        command = raw_input(prompt)
        if command == "count bases":
            base_tuple = count_bases(file)
            print "A: " + str(base_tuple[0]) + " C: " + str(base_tuple[1]) + " G: " + str(base_tuple[2]) + " T: " + str(base_tuple[3])
            print "GC%: " + str(base_tuple[4])
        
        elif command == "exit":
            running = False
            exit()
            
        elif command == "graph base count":
            print "Counting bases...",
            base_tuple = count_bases(file)
            print "Done."
            graph_base_count(base_tuple[0], base_tuple[1], base_tuple[2], base_tuple[3])
            
        elif command == "help":
            help()
        
        elif command == "print description":
            print file_description
        
        else:
            print "Command not found. Try again or type \"help\" for help."           
def process_pileups(bamfile, snp_d, read_checklist):
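    """For every SNP in `snp_d`, pile up the reads covering its position in
    `bamfile` (presumably a pysam BAM handle), store the per-base counts on
    the SNP object, and record the supporting reads in `read_checklist` so
    that ambiguous reads can be filtered out afterwards.
    """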
    N = len(snp_d)
    i = 1.0    
    for k in snp_d:
        if (i*100/N) % 5 == 0: 
            sys.stderr.write('%s: %i SNPs processed (%i%%)\n' % (get_timestamp(), i, i/N * 100))
        i += 1.0
        snp = snp_d[k][0]
        snp.test += 1
        # count_bases() fills these with counts for the keys 'A', 'C', 'G',
        # 'T', 'U', 'N', 'low_qual', 'del', 'mates_disagree' and 'unique_reads'.
        # Starting from None lets the check below skip SNPs whose region
        # yields no pileup column.
        base_count, reads_at_pos = None, None
        valid_reads = []
        # print   snp.get_region(k[0])  
        for col in bamfile.pileup(region=snp.get_region(k[0])):
            if snp.pos_0 == col.pos:
                base_count, weighted, valid_reads, reads_at_pos = count_bases(col)
                break
        if base_count is None or reads_at_pos is None:
            print 'Skipping', snp.get_region(k[0]) 
            continue
        snp.add_pileup(base_count, reads_at_pos)
        
        # print 'V', valid_reads
        valid_reads = set([(read, base==snp.mutation)
                           for read, base, seq, qpos in valid_reads            
                           if base in (snp.refbase, snp.mutation)])
        read_checklist.update(valid_reads)
    
    # This is a simulation to see how ambiguous SNPs can be removed from the read count
    # print snp_d.keys()
    # snp_d[('Chr4', 10952350)][0].r_support_mutation.append('HWI-ST377:127:D0PHGACXX:7:1306:11252:6941')
    # read_checklist.add(('HWI-ST377:127:D0PHGACXX:7:1306:11252:6941', True))
    # print 'x', sorted(snp_d[('Chr4', 10952350)][0].r_support_refbase[:10])
    # print 'x', sorted(snp_d[('Chr4', 10951240)][0].r_support_refbase[:10])
    
    filtered_reads = set(filter_ambiguous_reads(read_checklist))
    # print len(read_checklist), len(filtered_reads) 
    for k in snp_d:
        snp = snp_d[k][0]
        if snp.is_covered:
            snp.cleanse_counts(filtered_reads)
        
def analyse_snps(bamfile, snpfile_open, hit_mode, refpath=None, mult_counts=None, with_seqs=False, out=sys.stdout):
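    """Count the bases observed in `bamfile` at each SNP listed in the open,
    GFF-style `snpfile_open` and write one CSV row per covered SNP to `out`.
    Per-gene (hit_mode 'intra') or per-contig (hit_mode 'inter') read support
    is pickled next to the BAM file for later aggregation.
    """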
    header = ['', 'Pos_SNP', '#UniqueReads', '#Hits', '#Reads_Col', '#Reads_Ped', '#Score_Col', '#Score_Ped',
              '#Reads_N', '#Reads_lowqual', '#Reads_del', '#Reads_other']
    if hit_mode == 'intra':
        header[0] = 'AGI'
    elif hit_mode == 'inter':
        header[0] = 'Contig'
    else:
        sys.stderr.write('Wrong mode: %s. Exiting.\n' % hit_mode)
        sys.exit(1)
        
    if refpath is not None:        
        header.extend(['#match', '#bases', 'MAP_ERR'])
    
    if with_seqs:
        header.extend(['primer_col', 'primer_ped'])
    
    out.write(','.join(map(str, header)) + '\n')
        
    reads_per_gene = {}
    contig = None
    refseq = None
    for line in snpfile_open:
        snp = line.strip().split('\t')
        attributes = dict([attr.split('=') for attr in snp[8].split(';')])
        # next two if-conditions should always be False with new snp data
        # schedule for removal?
        if attributes['refbase'] not in 'ACGTU':
            continue
        if attributes['mutation'] not in 'ACGTU':
            continue
        
        # Load the reference sequence only when the contig changes and keep it
        # cached across SNPs on the same contig (flank counting relies on it).
        if refpath is not None and (contig is None or snp[0] != contig):
            ref_fn = os.path.join(refpath, 'TAIR10_chr%c.fas' % snp[0][-1])
            refseq = read_fasta(open(ref_fn, 'rb'))
        contig, start, end = snp[0], int(snp[3]), int(snp[4])
        region = '%s:%i-%i' % (contig, start, end) 
        
        base_count = None
        flank_counts = []
        for col in bamfile.pileup(region=region):
            if start == col.pos:
                base_count, weighted, valid_reads = count_bases(col, mult_counts=mult_counts)
            elif refseq is not None:
                count, dummy, valid_reads_flank = count_bases(col, mult_counts=mult_counts)                
                try:
                    # this needs to be in a try-block because there might be ambiguity codes... ><;
                    flank_counts.append((count[refseq[col.pos]], len(valid_reads_flank)))
                except KeyError:
                    pass
    
        if base_count is not None:                       
                       
            row = [None, end, 
                   base_count['unique_reads'],
                   sum(base_count.values()) - (base_count['unique_reads'] + base_count['del']),                   
                   base_count[attributes['refbase']],
                   base_count[attributes['mutation']],
                   weighted[attributes['refbase']],
                   weighted[attributes['mutation']],
                   base_count['N'],
                   base_count['low_qual'],
                   base_count['del'],
                   sum([base_count[v] for v in 'ACGT']) - \
                   (base_count[attributes['refbase']] + base_count[attributes['mutation']])]
            
            if refpath is not None:
                n_matches, n_bases = reduce(lambda x,y: (x[0]+y[0], x[1]+y[1]), flank_counts)
                mapping_error = 1.0 - (float(n_matches) / n_bases)
                row.extend([n_matches, n_bases, mapping_error])
            
            if hit_mode == 'intra':
                row[0] = attributes['gene_ID']
                rpg_key = row[0]                                                              
            elif hit_mode == 'inter':
                row[0] = contig
                rpg_key = '%s:%i' % (contig, end)                
            else:
                sys.stderr.write('Wrong mode: %s. Exiting.\n' % hit_mode)
                sys.exit(1)
            
            # list-set-list asserts that reads that cover multiple snps within the same gene
            # are only counted once (unless they disagree at different snp-sites, 
            # which is a different problem) 
            
            seqs = [(seq, qpos, base==attributes['mutation']) 
                    for read, base, seq, qpos in valid_reads
                    if base in (attributes['refbase'], attributes['mutation'])
                    and seq is not None]
            # colseq, pedseq = get_best_read_sequence(seqs)
            if with_seqs:
                colseq, pedseq = get_primer_sequence(seqs)
                row.extend([colseq, pedseq])
            out.write(','.join(map(str, row)) + '\n')
            
            valid_reads = list(set([(read, rpg_key, base==attributes['mutation'])
                                    for read, base, seq, qpos in valid_reads            
                                    if base in (attributes['refbase'], attributes['mutation'])]))
            
            if len(valid_reads) == 0:
                continue
            
            if rpg_key in reads_per_gene:                
                # reads_per_gene[attributes['gene_ID']] = reads_per_gene[attributes['gene_ID']].union(valid_reads)
                reads_per_gene[rpg_key].extend(valid_reads)
            else:
                reads_per_gene[rpg_key] = valid_reads
            
    
    with open(os.path.basename(bamfile.filename) + '.' + hit_mode + '.rpg.full.dat', 'wb') as f:
        pickle.dump(reads_per_gene, f)
    
    reads_per_gene = count_reads_per_gene(reads_per_gene)
    
    # if hit_mode == 'intra':         
    with open(os.path.basename(bamfile.filename) + '.' + hit_mode + '.rpg.dat', 'wb') as f:
        pickle.dump(reads_per_gene, f)
    bamfile.close()
    
    # return analysed_snps, reads_per_gene
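
# A minimal usage sketch (an assumption, not part of the original module): it
# presumes the BAM file is opened with pysam and that the SNP file is a
# GFF-style text file; the file names and the 'intra' mode are illustrative.
#
#     import pysam
#     bamfile = pysam.Samfile('alignments.bam', 'rb')
#     with open('snps.gff') as snpfile_open:
#         analyse_snps(bamfile, snpfile_open, 'intra',
#                      refpath='/path/to/TAIR10_chromosomes')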