def detect(data, args):
    """Scan the read-2 FASTQ for adapter-tagged reads carrying a poly-A stretch.

    Reads ``data['r2_path']`` and writes two gzip tables named after
    ``data['sample_id']``:

    * ``<sample_id>_polyA.dat.gz`` -- one row per accepted read with the
      columns: name, start, end, sequence before the stretch, poly-A
      sequence, poly-A quality, sequence after the stretch and its quality.
    * ``<sample_id>_none.dat.gz`` -- rejected reads, labelled ``No_tag``
      (``_adapter`` returned nothing), ``None`` (``poly_A_percentage``
      returned nothing) or ``shortA`` (stretch shorter than 6).

    Per-category counts are written to ``<sample_id>.stat``.  ``args`` is
    not used here.  ``data['detect']`` is set to the poly-A table path and
    ``data`` is returned; nothing is recomputed when that table exists.
    """
    fastq_path = data['r2_path']
    sample = data['sample_id']
    out_file = sample + "_polyA.dat.gz"
    out_name_false = sample + "_none.dat.gz"
    logger.my_logger.info("reading file %s" % fastq_path)
    logger.my_logger.info("creating files %s %s" % (out_file, out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data

    counts = Counter()
    records_seen = 0
    reject_row = "%s\t%s\t%s\t%s\n"
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(fastq_path) as handle, \
                gzip.open(tx_out_file, 'w') as out, \
                gzip.open(out_name_false, 'w') as out_false:
            for line in handle:
                records_seen += 1
                if records_seen % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % records_seen)
                # Only header lines with this prefix start a record.
                if not line.startswith("@HISEQ"):
                    continue
                name = line.strip()
                # Pull the rest of the 4-line record from the same iterator.
                seq = next(handle).strip()
                next(handle)  # third line of the record is skipped
                qual = next(handle).strip()

                trimmed = _adapter(seq, qual)
                if not trimmed:
                    out_false.write(reject_row % ("No_tag", name, seq, qual))
                    counts['notag'] += 1
                    continue
                seq, qual = trimmed

                span = poly_A_percentage(seq)
                if not span:
                    counts['noA'] += 1
                    out_false.write(reject_row % ("None", name, seq, qual))
                    continue

                start, end = span[0], span[1]
                if end - start < 6:
                    counts['shortA'] += 1
                    out_false.write(reject_row % ("shortA", name, seq, qual))
                    continue

                mod = seq[:start]
                out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                          (name, start, end, mod, seq[start:end],
                           qual[start:end], seq[end:], qual[end:]))
                counts['polyA'] += 1
                if mod:
                    counts['mod'] += 1

    with file_transaction(sample + ".stat") as tx_stat_file:
        Series(counts).to_csv(tx_stat_file, sep="\t")
    logger.my_logger.info("%s" % counts)
    return data
def prep_r2_with_barcode(fq1, fq2, out_file):
    """Write read-2 records renamed with the barcode taken from read 1.

    ``fq1`` and ``fq2`` are walked in lockstep, four lines per record.  For
    every pair the read names (first whitespace-separated token of the
    header, without its leading character) must match.  The barcode is
    ``mask(seq1[0:6], qual1[0:6], min_qual=10) + mask(seq1[6:], qual1[6:])``
    and is appended to the read name after a ``:``.  The renamed record,
    with read 2's sequence and quality, is formatted by ``format_fastq``
    and printed to ``out_file``.

    Returns ``out_file``.  Nothing is rewritten when it already exists.
    Raises ``AssertionError`` when the two files are out of step.
    """
    safe_makedir(os.path.dirname(out_file))
    if file_exists(out_file):
        print("%s and %s have already been barcode-prepped, skipping." % (fq1, fq2))
        return out_file
    with open_fastq(fq1) as r1_file, open_fastq(fq2) as r2_file:
        with file_transaction(out_file) as tx_out_file:
            # `with` guarantees the handle is closed even if a record fails.
            with open(tx_out_file, "w") as out_handle:
                r1_r2 = itertools.izip(r1_file, r2_file)
                for header1, header2 in r1_r2:
                    seq1, seq2 = next(r1_r2)
                    next(r1_r2)  # third line of each record is not used
                    qual1, qual2 = next(r1_r2)
                    read_name1 = header1.split()[0][1:]
                    read_name2 = header2.split()[0][1:]
                    assert read_name1 == read_name2, "FASTQ files may be out of order."
                    seq2, qual2 = seq2.rstrip(), qual2.rstrip()
                    # NOTE(review): seq1/qual1 are not stripped, so the second
                    # mask() call sees the trailing newline -- confirm mask()
                    # copes with that.
                    barcode = (mask(seq1[0:6], qual1[0:6], min_qual=10) +
                               mask(seq1[6:], qual1[6:]))
                    barcoded_name = ":".join([read_name2, barcode])
                    print(format_fastq([barcoded_name, seq2, qual2]), file=out_handle)
    return out_file
def _summarize(in_file, align_r2, count_file, out_file):
    """Tally poly-A reads and their modifications per gene.

    ``in_file`` is a gzip, tab-separated table (presumably the poly-A table
    produced by ``detect`` -- confirm with the caller); columns 3, 4 and 6
    are used.  Reads are matched to genes through ``_get_first_read`` and
    ``_get_second_read``.  Rows whose columns 3+4+6 exceed 135 characters
    are dropped.  For every remaining row where ``tune`` returns a value,
    a line goes to ``<out_file>.log``, and if ``read_gene[read][1]`` is
    truthy the gene's ``polyA`` counter and, when non-empty, the ``find[0]``
    counter are incremented.

    ``out_file`` gets one ``<gene> counts <n>`` line per gene followed by
    its ``<gene> <label> <n>`` lines.  Nothing is loaded or written when
    ``out_file`` already exists.  Returns ``None``.
    """
    if os.path.exists(out_file):
        return

    log_file = out_file + ".log"
    logger.my_logger.info("summarize results")
    read_gene, counts_gene = _get_first_read(count_file)
    logger.my_logger.info("load read 1")
    read_gene = _get_second_read(align_r2, read_gene)
    logger.my_logger.info("load read 2")

    stats = defaultdict(Counter)
    # Both handles are context-managed so the log is flushed and closed.
    with gzip.open(in_file) as handle_polya, open(log_file, 'w') as log_handle:
        for line in handle_polya:
            cols = line.strip().split("\t")
            read = cols[0].split(" ")[0].replace("@", "")
            if read not in read_gene:
                continue
            # Cheap length filter first; tune() only runs on kept rows.
            if len(cols[3] + cols[4] + cols[6]) > 135:
                continue
            find = tune(cols[3], cols[4])
            if not find:
                continue
            log_handle.write("%s %s %s ---> %s %s\n" %
                             (read, cols[3], cols[4], find, read_gene[read]))
            if read_gene[read][1]:
                gene = read_gene[read][0]
                stats[gene]["polyA"] += 1
                if find[0] != "":
                    stats[gene][find[0]] += 1

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out:
            for gene in counts_gene:
                out.write("%s counts %s\n" % (gene, counts_gene[gene]))
                if gene in stats:
                    for mod, c in stats[gene].items():
                        out.write("%s %s %s\n" % (gene, mod, c))
def rmdup(align_file, out_file):
    """Run the samtools | bammarkduplicates pipeline on ``align_file``.

    The shell pipeline is ``samtools view -bh`` piped into
    ``samtools sort -o -n`` (with ``<align_file>_tmp`` as its prefix
    argument) piped into ``bammarkduplicates rmdup=1``, whose output goes
    to ``out_file`` through ``file_transaction``.

    Returns ``out_file``; the pipeline is skipped when it already exists.
    """
    if os.path.exists(out_file):
        return out_file

    sort_prefix = align_file + "_tmp"
    template = ("samtools view -bh {align_file} | samtools sort -o -n - {tmp} | bammarkduplicates rmdup=1 O={tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        pipeline = template.format(align_file=align_file,
                                   tmp=sort_prefix,
                                   tx_out_file=tx_out_file)
        do.run(pipeline)
    return out_file
def _bowtie_align(fastq_file, control_index, out_file):
    """Align ``fastq_file`` against ``control_index`` and collect flagstat.

    bowtie2 output is piped through ``samtools view -Shb`` into
    ``out_file`` (written through ``file_transaction``); afterwards
    ``samtools flagstat`` is redirected into ``<out_file>.flagstat``.

    The alignment runs only when ``out_file`` is missing.  The statistics
    are generated whenever the alignment was just produced or the flagstat
    file is missing, so the returned path is not left dangling after an
    interrupted earlier run.

    Returns the path of the flagstat file.
    """
    stat_file = out_file + ".flagstat"
    needs_alignment = not os.path.exists(out_file)
    if needs_alignment:
        cmd = ("bowtie2 -p 4 --no-unal -x {control_index} -U {fastq_file} | samtools view -Shb /dev/stdin > {tx_out_file} ")
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(control_index=control_index,
                              fastq_file=fastq_file,
                              tx_out_file=tx_out_file),
                   "bowtie2 %s" % fastq_file)
    # flagstat reads the final file, so it runs after the transaction block.
    if needs_alignment or not os.path.exists(stat_file):
        do.run("samtools flagstat {out_file} > {stat_file}".format(
            out_file=out_file, stat_file=stat_file), "stats control sequences")
    return stat_file
def rmdup(align_file, out_file):
    """Produce ``out_file`` from ``align_file`` with duplicates removed.

    Builds one shell pipeline -- ``samtools view -bh``, then
    ``samtools sort -o -n`` using ``<align_file>_tmp`` as its prefix
    argument, then ``bammarkduplicates rmdup=1`` -- and runs it through
    ``do.run`` inside a ``file_transaction`` on ``out_file``.

    Returns ``out_file``; an existing ``out_file`` is left untouched.
    """
    if os.path.exists(out_file):
        return out_file

    pipeline_template = (
        "samtools view -bh {align_file} | samtools sort -o -n - {tmp} | bammarkduplicates rmdup=1 O={tx_out_file}"
    )
    scratch_prefix = align_file + "_tmp"
    with file_transaction(out_file) as tx_out_file:
        do.run(pipeline_template.format(align_file=align_file,
                                        tmp=scratch_prefix,
                                        tx_out_file=tx_out_file))
    return out_file
def count_umi(sam_file, gtf_file, barcode_to_well, multimappers=False):
    """Count distinct UMIs per (gene, well); stripped-down HTSeq counting.

    Exons from ``gtf_file`` are indexed by ``gene_id``.  Each aligned read
    in ``sam_file`` is assigned to a gene only if the gene sets overlapped
    by its matched (``M``) CIGAR intervals reduce to exactly one gene.
    Reads whose ``NH`` tag is above 1 are skipped unless ``multimappers``
    is true; reads without the tag are kept.  Reads whose barcode is not a
    key of ``barcode_to_well`` are ignored.

    Writes three files next to ``sam_file`` (extension replaced):
    ``.counts`` (tab-separated gene x well matrix of distinct UMI counts),
    ``.counts_umi.gz`` and ``.counts_umi_pos.gz`` (via
    ``write_extensive_summary`` / ``write_extensive_summary_by_pos``).

    Returns the ``.counts`` path, both when it is freshly written and when
    it already existed.
    """
    base, _ = os.path.splitext(sam_file)
    out_file = base + ".counts"
    out_umi_file = base + ".counts_umi.gz"
    out_umi_pos_file = base + ".counts_umi_pos.gz"
    if file_exists(out_file):
        return out_file

    wells = sorted(barcode_to_well.values())
    seen_umi = defaultdict(set)
    seen_umi_list = defaultdict(Counter)
    seen_umi_pos_list = defaultdict(Counter)

    exons = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    for feature in HTSeq.GFF_Reader(gtf_file):
        if feature.type == "exon":
            exons[feature.iv] += feature.attr["gene_id"]

    for read in HTSeq.SAM_Reader(sam_file):
        if not read.aligned:
            continue
        if not multimappers:
            try:
                if read.optional_field("NH") > 1:
                    continue
            except KeyError:
                # No NH tag: treat the read as uniquely mapped.
                pass
        iv_seq = (co.ref_iv for co in read.cigar
                  if co.type == "M" and co.size > 0)
        # NOTE(review): because the test is `not fs` rather than a
        # "first step" flag, an intersection that becomes empty is replaced
        # by the next step's set -- confirm this is the intended mode.
        fs = set()
        for iv in iv_seq:
            for iv2, fs2 in exons[iv].steps():
                if not fs:
                    fs = fs2.copy()
                else:
                    fs = fs.intersection(fs2)
        if len(fs) != 1:
            continue
        barcode, umi = get_barcode_and_umi(read)
        if barcode not in barcode_to_well:
            continue
        gene = next(iter(fs))
        well = barcode_to_well[barcode]
        fields = read.original_sam_line.split("\t")
        position = "%s:%s" % (fields[2], fields[3])
        seen_umi[(gene, well)].add(umi)
        seen_umi_list[(gene, well)][umi] += 1
        seen_umi_pos_list[(position, well, gene)][umi] += 1

    write_extensive_summary(seen_umi_list, out_umi_file)
    write_extensive_summary_by_pos(seen_umi_pos_list, out_umi_pos_file)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            print("\t".join(["feature"] + wells), file=out_handle)
            for feature in get_feature_names(gtf_file):
                counts = [str(len(seen_umi[(feature, well)])) for well in wells]
                print("\t".join([feature] + counts), file=out_handle)
    return out_file
def _assign_gene(in_file, prefix):
    """Read featureCounts output and assign each read a gene.

    ``in_file`` is tab-separated.  For every row whose second column is
    exactly ``Assigned``, the first and third columns are written,
    tab-separated, to ``<prefix>assign.dat``.

    Returns that path; the file is not rewritten when it already exists.
    """
    out_file = prefix + "assign.dat"
    if os.path.exists(out_file):
        return out_file

    with open(in_file) as handle, file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out:
            for record in handle:
                fields = record.strip().split("\t")
                if fields[1] != "Assigned":
                    continue
                out.write("%s\t%s\n" % (fields[0], fields[2]))
    return out_file
def _bowtie_align(fastq_file, control_index, out_file):
    """Align ``fastq_file`` to ``control_index`` with bowtie2 and run flagstat.

    The bowtie2 output is converted by ``samtools view -Shb`` and written
    to ``out_file`` through ``file_transaction``.  ``samtools flagstat`` on
    the finished file is redirected into ``<out_file>.flagstat``.

    Alignment is skipped when ``out_file`` exists.  Statistics are produced
    whenever the alignment has just been created or the flagstat file does
    not exist yet, so the returned path always refers to a file this
    function has attempted to create.

    Returns the flagstat path.
    """
    stat_file = out_file + ".flagstat"
    bam_missing = not os.path.exists(out_file)
    if bam_missing:
        cmd = (
            "bowtie2 -p 4 --no-unal -x {control_index} -U {fastq_file} | samtools view -Shb /dev/stdin > {tx_out_file} "
        )
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(control_index=control_index,
                              fastq_file=fastq_file,
                              tx_out_file=tx_out_file),
                   "bowtie2 %s" % fastq_file)
    # Run on the final path, i.e. only after the transaction block is left.
    if bam_missing or not os.path.exists(stat_file):
        do.run("samtools flagstat {out_file} > {stat_file}".format(
            out_file=out_file, stat_file=stat_file), "stats control sequences")
    return stat_file
def _summarize(in_file, align_r2, count_file, align_r1, out_file):
    """Tally binned poly-A lengths per gene, skipping exact duplicates.

    ``in_file`` is a gzip, tab-separated table; the first column is
    ``"<primer> <read name>"`` and columns 3, 4 and 6 are used (presumably
    modification, poly-A and gene sequence -- confirm with the producer).
    Only reads known to ``_get_first_read``/``_get_second_read`` whose
    ``read_gene[read][1]`` is truthy are considered.  Rows whose columns
    3+4+6 exceed 135 characters are dropped.  A read is an exact duplicate
    when an earlier accepted read had the same (read-1 position, primer).

    For each accepted read ``tune`` supplies ``find``; the bin of
    ``len(find[1])`` is counted for the gene, and additionally the pair
    (bin, ``find[0]``) when ``find[0]`` is non-empty.  Decisions are traced
    in ``<out_file>.log``.

    ``out_file`` lines have five space-separated fields.  Nothing is done
    when ``out_file`` already exists.  Returns ``None``.
    """
    if os.path.exists(out_file):
        return

    log_file = out_file + ".log"
    logger.my_logger.info("summarize results")
    read_gene, counts_gene = _get_first_read(count_file)
    read_position = _get_read1_position(align_r1, read_gene)
    logger.my_logger.info("load read 1 done")
    read_gene = _get_second_read(align_r2, read_gene)
    logger.my_logger.info("load read 2 done")

    stats = defaultdict(Counter)
    seen_fragments = set()  # (position, primer) pairs already counted
    find_dup = 0
    # Context-managing the log guarantees it is flushed and closed.
    with gzip.open(in_file) as handle_polya, open(log_file, 'w') as log_handle:
        for line in handle_polya:
            cols = line.strip().split("\t")
            name_fields = cols[0].split(" ")
            read = name_fields[1].replace("@", "")
            primer = name_fields[0]
            if read not in read_gene:
                continue
            log_handle.write("found %s %s %s %s ---> %s\n" %
                             (read, primer, cols[3], cols[4], read_gene[read]))
            if not read_gene[read][1]:
                continue
            if len(cols[3] + cols[4] + cols[6]) > 135:
                continue
            find = tune(cols[3], cols[4])
            pos = read_position[read][0]
            log_handle.write("log %s %s %s %s\n" %
                             (read, primer, pos, read_gene[read]))
            if (pos, primer) in seen_fragments:
                find_dup += 1
                continue
            if find:
                seen_fragments.add((pos, primer))
                log_handle.write("corrected %s %s %s --->%s %s\n" %
                                 (read, cols[3], cols[4], find, read_gene[read]))
                gene = read_gene[read][0]
                polya_size = _get_bin(len(find[1]))
                stats[gene][polya_size] += 1
                if find[0] != "":
                    stats[gene][(polya_size, find[0])] += 1
            else:
                log_handle.write("removed %s %s %s ---> %s %s\n" %
                                 (read, cols[3], cols[4], find, read_gene[read]))

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out:
            for gene in counts_gene:
                out.write("%s total counts %s 0\n" % (gene, counts_gene[gene]))
                if gene not in stats:
                    continue
                for mod, c in stats[gene].items():
                    if isinstance(mod, tuple):
                        # (bin, modification) key: add the _get_u_times value
                        u_times = _get_u_times(mod[1])
                        out.write("%s %s %s %s %s\n" % (gene, mod[0], mod[1], c, u_times))
                    else:
                        out.write("%s polyA %s %s 0\n" % (gene, mod, c))
    logger.my_logger.info("Found %s exact duplicates\n" % find_dup)
def bwa_align(fastq_path, reference_prefix, out_file, cores=1):
    """Align ``fastq_path`` with ``bwa aln`` piped into ``bwa samse``.

    The shell command uses ``MAX_EDIT_DISTANCE`` for ``-n``, ``-l 24`` and
    ``cores`` for ``-t``.  Its output is redirected to the temporary path
    handed out by ``file_transaction(out_file)``.

    Returns ``out_file``; when it already exists a message is printed and
    no alignment is run.  ``subprocess.CalledProcessError`` propagates if
    the shell command exits non-zero.
    """
    if file_exists(out_file):
        print("%s has already been aligned, skipping." % (fastq_path))
        return out_file

    template = ("bwa aln -n {edit_distance} -l 24 {reference_prefix} "
                "{fastq_path} -t {cores} | bwa samse {reference_prefix} - {fastq_path} "
                "> {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        shell_line = template.format(edit_distance=MAX_EDIT_DISTANCE,
                                     reference_prefix=reference_prefix,
                                     fastq_path=fastq_path,
                                     cores=cores,
                                     tx_out_file=tx_out_file)
        subprocess.check_call(shell_line, shell=True)
    return out_file
def _get_counts_stats(count_file, out_file):
    """Count how many distinct reads carry each label in ``count_file``.

    ``count_file`` is tab-separated with the read name in the first column
    and its label in the second.  Only the first row seen for a read name
    contributes.  ``out_file`` receives one ``<label> <count>`` line per
    label.

    Returns ``out_file``; it is not recomputed when it already exists.
    """
    if os.path.exists(out_file):
        return out_file

    label_totals = Counter()
    counted_reads = set()
    with file_transaction(out_file) as tx_out_file:
        with open(count_file) as in_handle:
            for record in in_handle:
                read, label = record.strip().split("\t")[:2]
                if read in counted_reads:
                    continue
                counted_reads.add(read)
                label_totals[label] += 1
        with open(tx_out_file, "w") as out_handle:
            for label, total in label_totals.items():
                out_handle.write("%s %s\n" % (label, total))
    return out_file
def clean_align(align_file, out_file):
    """Copy the reads of ``align_file`` that are not poorly mapped.

    ``align_file`` is opened with ``pysam.Samfile`` in mode ``"r"``.  Every
    read for which ``poorly_mapped_read`` is false is written to a SAM file
    with header (mode ``"wh"``) that uses the input as its template.  The
    output goes through ``file_transaction``.

    Returns ``out_file``; when it already exists the step is logged as
    skipped and nothing is rewritten.
    """
    if file_exists(out_file):
        logger.my_logger.info("%s has already been cleaned, skipping." % (align_file))
        return out_file

    with pysam.Samfile(align_file, "r") as in_handle, \
            file_transaction(out_file) as tx_out_file:
        out_handle = pysam.Samfile(tx_out_file, "wh", template=in_handle)
        try:
            for read in in_handle:
                if not poorly_mapped_read(read):
                    out_handle.write(read)
        finally:
            # close() has to be *called* (not merely referenced) so the
            # writer is closed before the transaction block is left.
            out_handle.close()
    return out_file
def _summarize(in_file, align_r2, count_file, out_file):
    """Tally binned poly-A lengths and modifications per gene.

    ``in_file`` is a gzip, tab-separated table; the read name is the first
    space-separated token of column 0 with ``@`` removed, and columns 3, 4
    and 6 are used (presumably modification, poly-A and gene sequence --
    confirm with the producer).  Only reads known to
    ``_get_first_read``/``_get_second_read`` whose ``read_gene[read][1]``
    is truthy are considered, and rows whose columns 3+4+6 exceed 135
    characters are dropped.

    When ``tune`` returns a value, the bin of ``len(find[1])`` is counted
    for the gene, plus the pair (bin, ``find[0]``) when ``find[0]`` is
    non-empty.  Decisions are traced in ``<out_file>.log``.

    ``out_file`` lines have five space-separated fields.  Nothing is loaded
    or written when ``out_file`` already exists.  Returns ``None``.
    """
    if os.path.exists(out_file):
        return

    log_file = out_file + ".log"
    logger.my_logger.info("summarize results")
    read_gene, counts_gene = _get_first_read(count_file)
    logger.my_logger.info("load read 1 done")
    read_gene = _get_second_read(align_r2, read_gene)
    logger.my_logger.info("load read 2 done")

    stats = defaultdict(Counter)
    # Context-managing the log guarantees it is flushed and closed.
    with gzip.open(in_file) as handle_polya, open(log_file, 'w') as log_handle:
        for line in handle_polya:
            cols = line.strip().split("\t")
            read = cols[0].split(" ")[0].replace("@", "")
            if read not in read_gene:
                continue
            log_handle.write("found %s %s %s ---> %s\n" %
                             (read, cols[3], cols[4], read_gene[read]))
            if not read_gene[read][1]:
                continue
            if len(cols[3] + cols[4] + cols[6]) > 135:
                continue
            find = tune(cols[3], cols[4])
            if find:
                log_handle.write("corrected %s %s %s --->%s %s\n" %
                                 (read, cols[3], cols[4], find, read_gene[read]))
                gene = read_gene[read][0]
                polya_size = _get_bin(len(find[1]))
                stats[gene][polya_size] += 1
                if find[0] != "":
                    stats[gene][(polya_size, find[0])] += 1
            else:
                log_handle.write("removed %s %s %s ---> %s %s\n" %
                                 (read, cols[3], cols[4], find, read_gene[read]))

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out:
            for gene in counts_gene:
                out.write("%s total counts %s 0\n" % (gene, counts_gene[gene]))
                if gene not in stats:
                    continue
                for mod, c in stats[gene].items():
                    if isinstance(mod, tuple):
                        # (bin, modification) key: add the _get_u_times value
                        u_times = _get_u_times(mod[1])
                        out.write("%s %s %s %s %s\n" % (gene, mod[0], mod[1], c, u_times))
                    else:
                        out.write("%s polyA %s %s 0\n" % (gene, mod, c))
def clean_align(align_file, out_file):
    """Write the reads of ``align_file`` that pass the mapping filter.

    ``align_file`` is opened with ``pysam.Samfile`` in mode ``"rb"``.  Reads
    for which ``poorly_mapped_read`` is true are dropped; the rest are
    written to a SAM file with header (mode ``"wh"``) templated on the
    input.  The output goes through ``file_transaction``.

    Returns ``out_file``; when it already exists the step is logged as
    skipped and nothing is rewritten.
    """
    if file_exists(out_file):
        logger.my_logger.info("%s has already been cleaned, skipping." % (align_file))
        return out_file

    with pysam.Samfile(align_file, "rb") as in_handle, \
            file_transaction(out_file) as tx_out_file:
        out_handle = pysam.Samfile(tx_out_file, "wh", template=in_handle)
        try:
            for read in in_handle:
                if poorly_mapped_read(read):
                    continue
                out_handle.write(read)
        finally:
            # close() has to be *called* (not merely referenced) so the
            # writer is closed before the transaction block is left.
            out_handle.close()
    return out_file
def detect(data, args):
    """Classify read-2 records by adapter tag and poly-A stretch.

    Input is the FASTQ at ``data['r2_path']``; outputs are named after
    ``data['sample_id']``:

    * ``<sample_id>_polyA.dat.gz`` -- accepted reads, eight tab-separated
      columns: name, start, end, sequence before the stretch, poly-A
      sequence, poly-A quality, remaining sequence, remaining quality.
    * ``<sample_id>_none.dat.gz`` -- rejected reads tagged ``No_tag``
      (no result from ``_adapter``), ``None`` (no result from
      ``poly_A_percentage``) or ``shortA`` (stretch shorter than 6).
    * ``<sample_id>.stat`` -- the per-category counters.

    ``args`` is not used here.  Sets ``data['detect']`` to the poly-A table
    path and returns ``data``; an existing poly-A table short-circuits the
    whole scan.
    """
    in_file = data['r2_path']
    out_prefix = data['sample_id']
    out_file = out_prefix + "_polyA.dat.gz"
    out_name_false = out_prefix + "_none.dat.gz"
    logger.my_logger.info("reading file %s" % in_file)
    logger.my_logger.info("creating files %s %s" % (out_file, out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data

    tally = Counter()
    n_iterations = 0
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(in_file) as handle, \
                gzip.open(tx_out_file, 'w') as accepted, \
                gzip.open(out_name_false, 'w') as rejected:

            def reject(label, name, seq, qual):
                # One rejected read per line, label first.
                rejected.write("%s\t%s\t%s\t%s\n" % (label, name, seq, qual))

            for header in handle:
                n_iterations += 1
                if n_iterations % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % n_iterations)
                # Anything not starting with this prefix is not a record start.
                if not header.startswith("@HISEQ"):
                    continue
                name = header.strip()
                # Consume the three remaining lines of the record.
                seq = next(handle).strip()
                next(handle)
                qual = next(handle).strip()

                tagged = _adapter(seq, qual)
                if not tagged:
                    reject("No_tag", name, seq, qual)
                    tally['notag'] += 1
                    continue
                seq, qual = tagged

                bounds = poly_A_percentage(seq)
                if not bounds:
                    tally['noA'] += 1
                    reject("None", name, seq, qual)
                    continue

                lo, hi = bounds[0], bounds[1]
                if hi - lo < 6:
                    tally['shortA'] += 1
                    reject("shortA", name, seq, qual)
                    continue

                prefix_seq = seq[:lo]
                row = (name, lo, hi, prefix_seq,
                       seq[lo:hi], qual[lo:hi], seq[hi:], qual[hi:])
                accepted.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % row)
                tally['polyA'] += 1
                if prefix_seq:
                    tally['mod'] += 1

    with file_transaction(out_prefix + ".stat") as tx_stat_file:
        Series(tally).to_csv(tx_stat_file, sep="\t")
    logger.my_logger.info("%s" % tally)
    return data
def write_extensive_summary_by_pos(well_umi_gen, out_file):
    """Dump per-key UMI counts to a gzip, tab-separated file.

    ``well_umi_gen`` maps 3-element keys to a mapping of UMI -> count.
    Each (UMI, count) entry becomes one line:
    ``<umi>\\t<key[0]>\\t<key[1]>\\t<key[2]>\\t<count>``.  Lines are joined
    with newlines and one trailing newline is written.  The file is
    written through ``file_transaction``.
    """
    with file_transaction(out_file) as tx_out_file:
        with gzip.open(tx_out_file, 'wb') as out_handle:
            blocks = []
            for key in well_umi_gen:
                # The key columns sit between the UMI and its count.
                middle = "\t%s\t%s\t%s\t" % (key[0], key[1], key[2])
                rows = []
                for umi, count in well_umi_gen[key].items():
                    rows.append(middle.join((str(umi), str(count))))
                blocks.append("\n".join(rows))
            out_handle.write("\n".join(blocks))
            out_handle.write("\n")