def repl(): running = True while(running): command = raw_input(prompt) if command == "count bases": base_tuple = count_bases(file) print "A: " + str(base_tuple[0]) + " C: " + str(base_tuple[1]) + " G: " + str(base_tuple[2]) + " T: " + str(base_tuple[3]) print "GC%: " + str(base_tuple[4]) elif command == "exit": running = False exit() elif command == "graph base count": print "Counting bases...", base_tuple = count_bases(file) print "Done." graph_base_count(base_tuple[0], base_tuple[1], base_tuple[2], base_tuple[3]) elif command == "help": help() elif command == "print description": print file_description else: print "Command not found. Try again or type \"help\" for help."
def process_pileups(bamfile, snp_d, read_checklist): N = len(snp_d) i = 1.0 for k in snp_d: if (i*100/N) % 5 == 0: sys.stderr.write('%s: %i SNPs processed (%i%%)\n' % (get_timestamp(), i, i/N * 100)) i += 1.0 snp = snp_d[k][0] snp.test += 1 base_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'U': 0, 'N': 0, 'low_qual': 0, 'del': 0, 'mates_disagree': 0, 'unique_reads': 0} reads_at_pos = dict(zip(base_count.keys(), [[] for key in base_count.keys()])) valid_reads = [] # print snp.get_region(k[0]) for col in bamfile.pileup(region=snp.get_region(k[0])): if snp.pos_0 == col.pos: base_count, weighted, valid_reads, reads_at_pos = count_bases(col) break if base_count is None or reads_at_pos is None: print 'Skipping', snp.get_region(k[0]) continue snp.add_pileup(base_count, reads_at_pos) # print 'V', valid_reads valid_reads = set([(read, base==snp.mutation) for read, base, seq, qpos in valid_reads if base in (snp.refbase, snp.mutation)]) read_checklist.update(valid_reads) # This is a simulation to see how ambiguous SNPs can be removed from the read count # print snp_d.keys() # snp_d[('Chr4', 10952350)][0].r_support_mutation.append('HWI-ST377:127:D0PHGACXX:7:1306:11252:6941') # read_checklist.add(('HWI-ST377:127:D0PHGACXX:7:1306:11252:6941', True)) # print 'x', sorted(snp_d[('Chr4', 10952350)][0].r_support_refbase[:10]) # print 'x', sorted(snp_d[('Chr4', 10951240)][0].r_support_refbase[:10]) filtered_reads = set(filter_ambiguous_reads(read_checklist)) # print len(read_checklist), len(filtered_reads) for k in snp_d: snp = snp_d[k][0] if snp.is_covered: snp.cleanse_counts(filtered_reads) pass
def analyse_snps(bamfile, snpfile_open, hit_mode, refpath=None, mult_counts=None, with_seqs=False, out=sys.stdout):
    """Analyse SNP coverage in a BAM file and write one CSV row per SNP.

    Parameters:
        bamfile      -- open pysam-style alignment file (supports .pileup())
        snpfile_open -- open tab-separated SNP file; column 9 holds
                        semicolon-separated key=value attributes including
                        'refbase' and 'mutation' (and 'gene_ID' for 'intra')
        hit_mode     -- 'intra' (group reads by gene) or 'inter' (group by
                        contig:position); any other value exits the program
        refpath      -- optional directory containing TAIR10_chr*.fas files;
                        when given, flanking-position match statistics and a
                        mapping-error estimate are added to each row
        mult_counts  -- passed through to count_bases()
        with_seqs    -- when True, primer sequences are appended to each row
        out          -- destination stream for the CSV output

    Side effects: pickles the reads-per-gene dictionaries next to the BAM
    file ('<bam>.<mode>.rpg.full.dat' and '.rpg.dat') and closes bamfile.
    """
    header = ['', 'Pos_SNP', '#UniqueReads', '#Hits', '#Reads_Col',
              '#Reads_Ped', '#Score_Col', '#Score_Ped', '#Reads_N',
              '#Reads_lowqual', '#Reads_del', '#Reads_other']
    if hit_mode == 'intra':
        header[0] = 'AGI'
    elif hit_mode == 'inter':
        header[0] = 'Contig'
    else:
        sys.stderr.write('Wrong mode: %s. Exiting.\n' % hit_mode)
        sys.exit(1)
    if refpath is not None:
        header.extend(['#match', '#bases', 'MAP_ERR'])
    if with_seqs:
        header.extend(['primer_col', 'primer_ped'])
    out.write(','.join(map(str, header)) + '\n')

    reads_per_gene = {}
    contig = None
    for line in snpfile_open:
        snp = line.strip().split('\t')
        attributes = dict([attr.split('=') for attr in snp[8].split(';')])
        # next two if-conditions should always be False with new snp data
        # schedule for removal?
        if attributes['refbase'] not in 'ACGTU':
            continue
        if attributes['mutation'] not in 'ACGTU':
            continue
        refseq = None
        if refpath is not None:
            # Load the reference chromosome lazily, only when it changes.
            if contig is None or snp[0] != contig:
                ref_fn = os.path.join(refpath, 'TAIR10_chr%c.fas' % snp[0][-1])
                refseq = read_fasta(open(ref_fn, 'rb'))
        contig, start, end = snp[0], int(snp[3]), int(snp[4])
        region = '%s:%i-%i' % (contig, start, end)
        base_count = None
        flank_counts = []
        for col in bamfile.pileup(region=region):
            if start == col.pos:
                base_count, weighted, valid_reads = count_bases(col, mult_counts=mult_counts)
            elif refseq is not None:
                count, dummy, valid_reads_flank = count_bases(col, mult_counts=mult_counts)
                try:
                    # The reference may contain ambiguity codes that are not
                    # keys of `count` -- skip those positions.
                    # (BUGFIX: narrowed the original bare `except:`.)
                    flank_counts.append((count[refseq[col.pos]],
                                         len(valid_reads_flank)))
                except (KeyError, IndexError):
                    pass
        if base_count is not None:
            row = [None, end,
                   base_count['unique_reads'],
                   sum(base_count.values()) - (base_count['unique_reads'] + base_count['del']),
                   base_count[attributes['refbase']],
                   base_count[attributes['mutation']],
                   weighted[attributes['refbase']],
                   weighted[attributes['mutation']],
                   base_count['N'],
                   base_count['low_qual'],
                   base_count['del'],
                   sum([base_count[v] for v in 'ACGT']) -
                   (base_count[attributes['refbase']] + base_count[attributes['mutation']])]
            if refpath is not None:
                # BUGFIX: give reduce an initializer so an empty flank list
                # no longer raises TypeError, and guard the division so zero
                # counted bases yields NaN instead of ZeroDivisionError.
                n_matches, n_bases = reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]),
                                            flank_counts, (0, 0))
                if n_bases:
                    mapping_error = 1.0 - (float(n_matches) / n_bases)
                else:
                    mapping_error = float('nan')
                row.extend([n_matches, n_bases, mapping_error])
            if hit_mode == 'intra':
                row[0] = attributes['gene_ID']
                rpg_key = row[0]
            else:
                # hit_mode can only be 'inter' here; any other value already
                # exited during header construction above.
                row[0] = contig
                rpg_key = '%s:%i' % (contig, end)
            if with_seqs:
                # Only build the sequence list when it is actually used.
                seqs = [(seq, qpos, base == attributes['mutation'])
                        for read, base, seq, qpos in valid_reads
                        if base in (attributes['refbase'], attributes['mutation'])
                        and seq is not None]
                colseq, pedseq = get_primer_sequence(seqs)
                row.extend([colseq, pedseq])
            out.write(','.join(map(str, row)) + '\n')
            # list-set-list asserts that reads that cover multiple snps
            # within the same gene are only counted once (unless they
            # disagree at different snp-sites, which is a different problem)
            valid_reads = list(set([(read, rpg_key, base == attributes['mutation'])
                                    for read, base, seq, qpos in valid_reads
                                    if base in (attributes['refbase'], attributes['mutation'])]))
            if len(valid_reads) == 0:
                continue
            reads_per_gene.setdefault(rpg_key, []).extend(valid_reads)
    # BUGFIX: use `with` so the pickle files are flushed and closed (the
    # original leaked both file handles).
    rpg_base = os.path.basename(bamfile.filename) + '.' + hit_mode
    with open(rpg_base + '.rpg.full.dat', 'wb') as rpg_out:
        pickle.dump(reads_per_gene, rpg_out)
    reads_per_gene = count_reads_per_gene(reads_per_gene)
    with open(rpg_base + '.rpg.dat', 'wb') as rpg_out:
        pickle.dump(reads_per_gene, rpg_out)
    bamfile.close()