def main(args):
    """Validate the input/output arguments, then hand off to ``core``.

    Reads ``args.bam`` (input path), ``args.o`` (output path or ``None``)
    and ``args.z`` (gzip flag).  ``args.o`` and ``args.z`` may be modified
    in place so that they agree with each other:

    * gzip requested but the output name lacks ``.gz`` -> ``.gz`` is appended
    * output name ends in ``.gz`` but gzip not requested -> gzip is enabled

    Returns:
        1 if the input file does not exist, otherwise 0.
    """
    ##
    ## check input files
    ##
    if not isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if args.o is not None:
        if isfile(args.o):
            ms.message("Output file exists. Overwriting.")

        # plain suffix test; avoids the invalid "\." escape that the
        # non-raw regex literal used to carry
        has_gz_suffix = args.o.endswith(".gz")
        if args.z:
            if not has_gz_suffix:
                args.o += ".gz"
        elif has_gz_suffix:
            # auto enable gzip if the output file name has .gz at the end
            args.z = True

    # NOTE(review): the return value of core() is not inspected here, so a
    # failure inside core() is still reported as success - confirm intent.
    core(args)

    return 0
def main(args):
    """Check a batch file of input paths and pass the valid list to ``core``.

    ``args.batch_file`` is read line by line; each non-blank line is treated
    as a path that must exist.  Every missing path is reported, and if any
    path is missing nothing is processed.

    Returns:
        1 if the batch file or any listed file is missing, otherwise the
        value returned by ``core(flist)``.
    """
    if not isfile(args.batch_file):
        ms.error_message("Input batch file does not exist ({})".format(
            args.batch_file))
        return 1

    flist = []
    err_flag = False

    ms.message("Checking batch file")
    t0 = time()

    # load the batch file and confirm all of the input files exist.
    with open(args.batch_file, "r") as fin:
        for szl in fin:
            fname = szl.strip()
            if not fname:
                # blank lines are not file names; skip them instead of
                # reporting "'' does not exist"
                continue

            if isfile(fname):
                flist.append(fname)
            else:
                err_flag = True
                ms.error_message("{} does not exist".format(fname))

    ms.time_diff(t0)

    if err_flag:
        return 1

    # the original returned 1 unconditionally here, which flagged every
    # successful run as a failure; report core's own status instead
    return core(flist)
def core(args):
    """Copy transcript-level attributes onto the exon rows of a GTF.

    The file at ``args.gtf`` is read twice.  The first pass collects the
    attribute dict of every ``transcript`` row keyed by ``transcript_id``.
    The second pass re-emits the file; for each ``exon`` row whose
    ``transcript_id`` was seen in pass one, the exon attributes are replaced
    by ``merge_annot(exon_attrs, transcript_attrs)``.

    Output goes to ``args.o`` when it is not ``None``, otherwise to stdout.
    The complete output is buffered before writing.

    Returns:
        0
    """
    dtid = {}

    ms.message("Parsing transcript entries in GTF")
    with open(args.gtf, "r") as fin:
        # parse the GTF to find the 'transcript' rows and gather info
        for szl in fin:
            if szl.startswith("#"):
                continue

            grow = gtf_parseline(szl)
            if grow['type'] == "transcript":
                dtid[grow['attrs']['transcript_id']] = grow['attrs']

    # finished. now we can pass back through and parse the exon rows out.
    # pieces are collected in a list and joined once; the previous
    # "szout += ..." pattern re-copied the growing string for every row.
    chunks = []

    ms.message("Incorporating transcript annotations into exon fields")
    with open(args.gtf, "r") as fin:
        for szl in fin:
            if szl.startswith("#"):
                # comment/header lines pass through untouched
                chunks.append(szl)
                continue

            grow = gtf_parseline(szl)

            if grow['type'] == "exon":
                tid = grow['attrs']['transcript_id']
                if tid in dtid:
                    # combine annotation information
                    rres = merge_annot(grow['attrs'], dtid[tid])
                    grow['attrs'] = copy.deepcopy(rres)

            # NOTE(review): rows of every feature type are written back, not
            # only exons - this is the reading assumed from the collapsed
            # source; confirm against the original layout.
            chunks.append(gtf_row_tostring(grow))
            chunks.append("\n")

    szout = "".join(chunks)

    if args.o is not None:
        with open(args.o, "w") as fout:
            fout.write(szout)
    else:
        sys.stdout.write(szout)

    return 0
def main(args):
    """Validate the BAM input, make sure the output folder exists, run ``core``.

    Reads ``args.bam`` and ``args.outpath``.

    Returns:
        1 if ``args.bam`` is missing or does not end in ``.bam``; otherwise
        whatever ``core(args)`` returns.
    """
    # check input file
    if not os.path.isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    # suffix test instead of a non-raw regex with an invalid "\." escape
    if not args.bam.endswith(".bam"):
        ms.error_message("Input file should be a BAM file.")
        return 1

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        # makedirs also creates missing parent folders; mkdir raised for
        # a nested path such as "results/run1"
        os.makedirs(args.outpath)

    return core(args)
def main(args):
    """Print the annotation in ``args.gtf`` as a tab-delimited table on stdout.

    The first eleven columns are fixed (eight positional GTF fields followed
    by transcript_id, gene_id and gene_name).  They are followed by one
    column per remaining attribute name, sorted; a row lacking an attribute
    gets "na" in that column.

    Returns:
        1 if ``args.gtf`` does not exist, otherwise 0.
    """
    # check input file
    if not os.path.isfile(args.gtf):
        ms.error_message("Input file is missing")
        return 1

    t0 = time()
    ms.message("Loading annotation")
    dtid, dtid2gname, all_attrs, tid_order = load(args.gtf)
    ms.time_diff(t0)

    # attributes that already have a dedicated column are excluded from the
    # variable part of the table
    fixed_ids = set(["transcript_id", "gene_id", "gene_name"])
    all_attrs = sorted(all_attrs.difference(fixed_ids))

    # header
    write = sys.stdout.write
    write(
        "chrom\tdb\tfeature\tstart\tend\tscore\tstrand\tframe\ttranscript_id\tgene_id\tgene_name\t"
    )
    write("\t".join(all_attrs))
    write("\n")

    # one output row per feature, transcripts in their loaded order
    for tid in tid_order:
        for gg in dtid[tid]:
            row = gg.parts[0:8]
            row.append(gg.transcript_id())
            row.append(gg.gene_id())
            row.append(gg.gene_name())
            row.extend(
                gg.attr[aid] if aid in gg.attr else "na" for aid in all_attrs)
            write("\t".join(row) + "\n")

    return 0
def usage():
    """Return the one-line command-line usage string."""
    return "usage: bam-stats.py <sam/bam>"


#==============================================================================
# main
#==============================================================================

if __name__ == "__main__":

    argv = sys.argv
    argc = len(argv)

    # at least one input file is required
    if argc < 2:
        ms.message(usage())
        sys.exit(1)

    # drop the script name, keep only the file arguments
    argv = argv[1:]

    stat_list = []

    # as visible here the loop only reports inputs that do not exist
    for fname in argv:
        if not isfile(fname):
            ms.error_message("file does not exist ({})".format(fname))
            continue
def main(args):
    """Estimate the cell count of a barcoded alignment file and split it per cell.

    Steps, in order:

    1. Create ``args.outpath`` if needed and validate ``args.R`` (when given,
       per-cell files are handed to ``quantification_worker`` processes;
       otherwise to ``compress_reads`` processes).
    2. If ``args.fin`` ends in ``.bam`` convert it to SAM inside the output
       folder with ``samtools view -h``.
    3. Index the SAM: for each barcode record the offset of every alignment
       line, the UMI counts over non-secondary alignments, and the number of
       mapped primary alignments.  The indexes are cached as pickles in the
       output folder unless ``args.no_pickles`` is set, and reused when all
       of them are present.
    4. Estimate the number of cells from ``args.exp_cells`` (barcodes whose
       distinct-UMI count is at least one tenth of the count found at index
       ``floor(exp_cells * 0.01 - 1)`` of the descending list) and write
       ``barcode_umi_counts.txt`` and ``cell_summary.tsv``.
    5. Unless ``args.estimate_only`` is set, write one ``<barcode>.sam`` per
       retained barcode (optionally subsampled by read name when
       ``0 < args.samplerate < 1``) and queue each file for the workers.
       ``args.force_cells`` overrides the number of barcodes written.

    Returns:
        0 on success, 1 on a missing annotation file, a failed samtools
        conversion, or an input without any barcodes.
    """
    bc = {}                          # barcode -> list of line offsets
    bc_readcount = defaultdict(int)  # barcode -> mapped primary alignments
    bc_umi = {}                      # barcode -> {umi: count}
    num_bc = 0
    offset = 0
    lnum = 0
    bam_flag = False
    sam_header = ""
    quant_mode = False

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        # makedirs so that a nested output path does not fail
        os.makedirs(args.outpath)

    file_queue = JoinableQueue()
    pool = []

    if args.R is not None:
        if not os.path.isfile(args.R):
            ms.error_message(
                "Supplied annotation file does not exist ({})".format(args.R))
            return 1
        quant_mode = True

    ##
    ## figure out if we have a bam as input. if so we have to convert it to
    ## sam for indexing
    ##
    if re.search(r"\.bam$", args.fin):
        # send the sam file into the output folder
        sam_name = args.outpath + "/" + os.path.basename(
            re.sub(r"\.bam$", ".sam", args.fin))

        if not os.path.isfile(sam_name):
            # need to convert alignments to sam
            bam_flag = True
            # NOTE(review): the command is assembled by string formatting, so
            # paths containing spaces or shell metacharacters will break it
            # if runcmd goes through a shell - confirm how runcmd executes.
            cmd = "samtools view -h {} > {}".format(args.fin, sam_name)
            t0 = time()
            message("Temporarily converting BAM to SAM format")
            rres = runcmd(cmd)
            if rres[0] != 0:
                sys.stderr.write(
                    "Error: samtools exited with non-zero exit status!\n")
                # drop the partial file; otherwise the next run would find
                # it, skip the conversion and index truncated data
                if os.path.isfile(sam_name):
                    os.unlink(sam_name)
                return 1
            sys.stderr.write("{} sec\n".format(time() - t0))
    else:
        sam_name = args.fin

    ##
    ## we need to index all barcodes and track umi per barcode. if these
    ## pickle files exist we can use them
    ##
    bc_pkl = args.outpath + "/" + BC_PICKLE
    bc_umi_pkl = args.outpath + "/" + BC_UMI_PICKLE
    bc_readcount_pkl = args.outpath + "/" + BC_READCOUNT
    sam_header_pkl = args.outpath + "/sam_header.pkl"

    # all four files are loaded below, so all four must exist (the header
    # pickle was previously loaded without being checked)
    pickle_paths = [bc_pkl, bc_umi_pkl, bc_readcount_pkl, sam_header_pkl]

    if all(os.path.isfile(path) for path in pickle_paths):
        ##
        # load indexes from pickles
        # NOTE: pickle.load executes whatever the file encodes; only safe as
        # long as the output folder is trusted.
        ms.message(
            "Loading existing barcode and umi indexes from output folder")
        t0 = time()
        with open(bc_pkl, "rb") as fh:
            bc = pickle.load(fh)
        with open(bc_umi_pkl, "rb") as fh:
            bc_umi = pickle.load(fh)
        with open(bc_readcount_pkl, "rb") as fh:
            bc_readcount = pickle.load(fh)
        with open(sam_header_pkl, "rb") as fh:
            sam_header = pickle.load(fh)
        num_bc = len(bc)
        ms.time_diff(t0)

    else:
        # we have to index
        #
        # parse the alignments. in this loop we only extract the cell barcode
        # and the umi plus record the file position offsets for barcodes.
        # the dict that is built is indexed by the barcodes and each element
        # contains a list of file offsets for reads that came from that
        # barcode. we also get all of the distinct umis collected per barcode
        # in this loop in order to estimate the actual cell count before
        # writing all of the read files out to disk
        message('Indexing cell barcodes from alignments and counting raw UMI.')
        t0 = time()

        # NOTE(review): offsets are accumulated from len(line) of text-mode
        # lines, which equals the byte offset only for single-byte text with
        # "\n" line endings - confirm inputs always satisfy that.
        with open(sam_name, "r") as fin:
            for szl in fin:
                if szl[0] == "@":
                    # append header line to header string
                    sam_header += szl
                    offset += len(szl)
                    continue

                # count lines and produce progress message so we know this
                # thing is running
                lnum += 1
                if lnum % 1000000 == 0:
                    progress_message("read {} lines".format(lnum))

                # fetch the cell barcode from the read name
                line_bc = parse_barcode(szl)

                if line_bc not in bc:
                    # first encounter with this barcode
                    num_bc += 1
                    bc[line_bc] = []
                    bc_umi[line_bc] = defaultdict(int)

                # append line offset to this barcode's list
                bc[line_bc].append(offset)

                flag = int(szl.split("\t")[1])
                if (flag & 0x100) == 0:
                    # not a secondary alignment. track its umi.
                    umi = parse_umi(szl)
                    bc_umi[line_bc][umi] += 1

                    if (flag & 0x4) == 0:
                        # aligned and primary: counts toward this barcode's
                        # aligned read count
                        bc_readcount[line_bc] += 1

                # update offset to the next line
                offset += len(szl)

        # final progress message and total time of parsing
        progress_message("read {} lines".format(lnum), last=True)
        sys.stderr.write("{} sec\n".format(time() - t0))

        t0 = time()
        if not args.no_pickles:
            ms.message("saving indexes to disk")
            for obj, path in zip([bc, bc_umi, bc_readcount, sam_header],
                                 pickle_paths):
                with open(path, "wb") as fh:
                    pickle.dump(obj, fh)
            ms.time_diff(t0)

    #
    # implement cell number detection per 10x.
    # take the 'exp-cells' value (expected cells) and multiply that by 0.01
    # to get an index. sort the barcodes by umi count in descending order,
    # take the umi count at that index and scale it by 0.1. every barcode,
    # starting from the top of the sorted list, with at least that many UMI
    # is called a cell.
    #
    t0 = time()
    message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id.
    bc_umi_counts = []
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        fout.write("barcode\tumi_count\tdistinct_reads\talignments\n")
        for lbc in bc:
            n_umi = len(bc_umi[lbc])
            bc_umi_counts.append([lbc, n_umi])
            fout.write("\t".join(
                map(str, [lbc, n_umi, bc_readcount[lbc], len(bc[lbc])])))
            fout.write("\n")

    if not bc_umi_counts:
        # nothing to estimate from; the indexing below would raise IndexError
        ms.error_message("No cell barcodes found in {}".format(sam_name))
        return 1

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)

    # clamp the reference index into the list: small exp_cells values gave
    # -1 (i.e. the *lowest* barcode) and large ones ran past the end
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))
    exp_cells = min(max(exp_cells, 0), len(bc_umi_counts) - 1)
    umi_threshold = bc_umi_counts[exp_cells][1] * 1.0 / 10

    num_reads = 0
    num_umi = 0
    num_cells = 0
    # bounded iteration: the old "while True" indexed past the end of the
    # list whenever every barcode passed the threshold
    for lbc, n_umi in bc_umi_counts:
        if n_umi < umi_threshold:
            break
        # count umi and count distinct reads
        num_reads += bc_readcount[lbc]
        num_umi += n_umi
        num_cells += 1

    # the top barcode always meets a threshold derived from a count no larger
    # than its own, so num_cells >= 1 from here on

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:
        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # median barcode (by position in the umi-sorted list) and its read
        # count. integer division keeps the index an int, and n // 2 is the
        # 0-based middle element (the old odd-count branch pointed one past
        # it and out of range for a single cell)
        median_idx = num_cells // 2
        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(
            bc_readcount[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes: {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate: {}\n".format(num_cells))

    if args.estimate_only:
        if bam_flag:
            # input was BAM so we can dump the converted file. the extra
            # checks make certain that the original file is not deleted.
            if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                    sam_name != args.fin):
                os.unlink(sam_name)

        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output: {}\n".format(num_cells))

    t0 = time()
    message(
        "Parsing individual detected cell alignments out to individual files")

    # quantification workers when an annotation was supplied, otherwise
    # workers that compress the per-cell files
    if quant_mode:
        worker_target = quantification_worker
        worker_args = (file_queue, args, )
    else:
        worker_target = compress_reads
        worker_args = (file_queue, )

    for i in range(args.p):
        p = Process(target=worker_target, args=worker_args)
        p.daemon = True
        p.start()
        pool.append(p)

    subsample = 0 < args.samplerate < 1

    # write individual cell files
    with open(sam_name, "r") as fin:
        for i in range(num_cells):
            # get barcode
            lbc = bc_umi_counts[i][0]
            cell_file = "{}/{}.sam".format(args.outpath, lbc)

            # update user on progress
            progress_message("Writing {} - {}/{} ({} reads)".format(
                cell_file, i + 1, num_cells, len(bc[lbc])))

            # every cell file starts with the full header; lines are
            # collected in a list and joined once
            lines = [sam_header]

            if subsample:
                ##
                # secondary alignments share a read name, so alignments are
                # grouped by read name first and whole reads are sampled.
                read_index = defaultdict(list)
                for offset in bc[lbc]:
                    fin.seek(offset)
                    aln = fin.readline().strip().split("\t")
                    read_index[aln[0]].append(offset)

                for rname in read_index.keys():
                    if random.random() > args.samplerate:
                        continue
                    # dump every alignment of this read
                    for offset in read_index[rname]:
                        fin.seek(offset)
                        lines.append(fin.readline())
            else:
                # append every alignment line recorded for this barcode
                for offset in bc[lbc]:
                    fin.seek(offset)
                    lines.append(fin.readline())

            # write the file
            with open(cell_file, "w") as fout:
                fout.write("".join(lines))

            # send the file off to the worker processes
            file_queue.put(cell_file)

    sys.stderr.write("\n")
    sys.stderr.write("{} sec\n".format(time() - t0))

    if bam_flag:
        # input was BAM so we can dump the converted file. the extra checks
        # make certain that the original file is not deleted.
        if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                sam_name != args.fin):
            os.unlink(sam_name)

    sys.stderr.write("Waiting for child process to finish compressing files\n")

    # one None sentinel per worker, then wait for queue and processes
    for p in pool:
        file_queue.put(None)

    file_queue.join()

    for p in pool:
        p.join()

    message("finished!")

    return 0
def core(left, right, args):
    """Fold cell barcode and UMI from the first-mate reads into second-mate read names.

    ``left[i]`` and ``right[i]`` are paired FASTQ files.  For every record of
    the right file the matching record of the left file is consumed; the
    first ``args.barcode_length`` bases of its sequence are placed in front
    of the read name and the following ``args.umi_length`` bases are appended
    to it, giving ``@<barcode>:<original name>:<umi>``.  The renamed records
    are written to ``<stub>_prepped.fastq`` in the current directory, where
    ``<stub>`` is the right file's base name up to the first ".".  Each
    finished output file is queued for a ``gz_worker`` child process.

    Exits the interpreter with status 1 if an output name collides with an
    input path or if processing a pair fails.

    Returns:
        0
    """
    bc_len = args.barcode_length
    umi_len = args.umi_length

    ##
    ## create queue and child process for compressing the fastq files
    ##
    tasks = JoinableQueue()
    p = Process(target=gz_worker, args=(tasks, ))
    p.daemon = True
    p.start()

    ##
    ## input files are paired from the sequencer so we just have to read
    ## through them and write them back out
    for i, m1 in enumerate(left):
        m2 = right[i]

        # pick apart the name of the second file to build the output name
        stub = basename(m2).split(".")[0]
        outfile = "{}_prepped.fastq".format(stub)

        if outfile == m1 or outfile == m2:
            ms.error_message(
                "Output file path matches input file path. WTF? {}".format(
                    outfile))
            sys.exit(1)

        if isfile(outfile):
            ms.warning_message(
                "output file exists. overwriting. {}".format(outfile))

        try:
            ms.message("Processing {}".format(m1))

            # the context managers close all three handles on any exit path
            with open(outfile, "w") as fout, \
                    open_reads(m1) as fin1, open_reads(m2) as fin2:
                nidx = 0    # line number within the current 4-line record
                nreads = 0
                lread = []

                for szl2 in fin2:
                    # report once per milestone, at the start of a record
                    # (previously emitted for each of the record's 4 lines)
                    if nidx == 0 and (nreads % 1000000) == 0:
                        ms.progress_message("Parsed {} reads".format(nreads))

                    nidx += 1

                    if nidx == 1:
                        # read name line
                        rname = szl2.strip().split()[0]

                        # skip the name line of the barcode file and take
                        # its sequence line
                        fin1.readline()
                        szl1 = fin1.readline().strip()

                        # the cell barcode goes at the front of the read
                        # name, the umi at the end
                        rname_tmp = re.sub(r"^@", "", rname)
                        rname = "@{}:{}".format(szl1[0:bc_len], rname_tmp)
                        rname += ":{}".format(
                            szl1[bc_len:(bc_len + umi_len)])
                        lread.append(rname + "\n")

                        # consume the remaining two lines of this record
                        # from the barcode file
                        fin1.readline()
                        fin1.readline()

                    elif nidx < 4:
                        lread.append(szl2)

                    if nidx == 4:
                        # finished with read
                        lread.append(szl2)
                        fout.write("".join(lread))
                        nidx = 0
                        lread = []
                        nreads += 1

                ms.progress_message("Parsed {} reads".format(nreads),
                                    last=True)

        except Exception as err:
            # the bare "except:" used to exit without saying what went wrong
            ms.error_message("Failed while processing {} and {}: {}".format(
                m1, m2, err))
            sys.exit(1)

        # hand the finished file to the compression worker
        tasks.put(outfile)

    # sentinel for the worker
    tasks.put(None)
    ms.message("Waiting for gzip compression to complete.")
    tasks.join()
    p.join()

    # done
    return 0
#==============================================================================
# main
#==============================================================================

if __name__ == "__main__":

    argv = sys.argv
    argc = len(sys.argv)

    # two positional arguments are required: the refflat file and the stub
    # used for the output file name
    if argc < 3:
        message(usage())
        sys.exit(1)

    # keep only the positional arguments
    argv = argv[1:]

    ms.message("Loading refflat into object")
    ta = TranscriptomeAnnotation.TranscriptomeAnnotation()
    t0 = time()
    ta.load_refflat(argv[0])
    ms.time_diff(t0)

    ms.message("building lookup table from object")
    t0 = time()
    lktable2 = ta.build_lktable()
    ms.time_diff(t0)

    outfile = "{}.t.sam".format(argv[1])

    # reference names and their lengths, kept in the same order
    out_rnames = ta.names
    out_lengths = [ta.d[k].length for k in out_rnames]
def core(flist):
    """Merge per-sample hit tables into sparse counts, samples and genes files.

    Each file in ``flist`` is one sample; the sample name is the part of the
    path before the first ".".  Lines starting with "#" are comments, of
    which ``#total_reads=<n>`` supplies the sample's total read count.  Every
    other line is tab-delimited with the target name in column 1 and its
    value in column 2.

    Three files are written, named by the module constants:

    * ``COUNTS``  - one "target_index<TAB>sample_index<TAB>value" row per
      entry whose value is greater than zero
    * ``SAMPLES`` - "sample_name<TAB>total_reads", in input order
    * ``GENES``   - target names, in order of first appearance

    Returns:
        0
    """
    dtargets = {}   # target name -> index into ltargets
    ltargets = []
    dsamples = {}   # sample name -> total reads
    lsamples = []
    lhits = []

    # compiled once instead of once per comment line
    total_reads_re = re.compile(r"^#total_reads=([0-9]+)")

    ms.message("Loading hits from {} files".format(len(flist)))
    t0 = time()

    for sample_idx, f in enumerate(flist):
        sample_name = f.split(".")[0]
        lsamples.append(sample_name)
        # keep track of total reads
        dsamples[sample_name] = 0

        with open(f, "r") as fin:
            for szl in fin:
                szl = szl.strip()
                if not szl:
                    # blank line: indexing szl[0] below would raise
                    continue

                if szl[0] == "#":
                    r = total_reads_re.search(szl)
                    if r:
                        dsamples[sample_name] = r.group(1)
                    continue

                aln = szl.split("\t")
                target = aln[0]

                if target not in dtargets:
                    dtargets[target] = len(ltargets)
                    ltargets.append(target)

                if float(aln[1]) > 0:
                    # record the hit as target index, sample index, hit count
                    lhits.append([dtargets[target], sample_idx, aln[1]])

    ms.time_diff(t0)

    ##
    ## finished parsing files
    ##

    ms.message("Writing results to your base.")
    t0 = time()

    # create count output
    with open(COUNTS, "w") as fout:
        for hit in lhits:
            fout.write("\t".join(map(str, hit)) + "\n")

    # create samples file
    with open(SAMPLES, "w") as fout:
        for sid in lsamples:
            fout.write("\t".join(map(str, [sid, dsamples[sid]])))
            fout.write("\n")

    # create genes file
    with open(GENES, "w") as fout:
        for tid in ltargets:
            fout.write(tid + "\n")

    ms.time_diff(t0)

    return 0
def core(args):
    """Estimate the cell count of a barcoded BAM and export one FASTQ per cell.

    Read names are expected to be ":"-separated with the cell barcode as the
    first field and the UMI as the last.

    Pass 1 over ``args.bam`` counts alignments and distinct UMIs per barcode.
    The cell count is then estimated from ``args.exp_cells`` (barcodes whose
    distinct-UMI count is at least one tenth of the count found at index
    ``floor(exp_cells * 0.01 - 1)`` of the descending list);
    ``barcode_umi_counts.txt`` and ``cell_summary.tsv`` are written to
    ``args.outpath``.  With ``args.estimate_only`` the function stops there.

    Pass 2 buffers the alignments of each retained barcode, keeping each
    alignment with probability ``args.samplerate``, and writes
    ``<barcode>.fastq`` whenever the barcode changes.  Each file is queued
    for one of ``args.p`` ``compress_reads`` processes.
    ``args.force_cells`` overrides the number of barcodes exported.

    NOTE(review): pass 2 flushes on every barcode change, so it presumably
    relies on the BAM being grouped by barcode; otherwise a later run of the
    same barcode overwrites the earlier file - confirm upstream sorting.

    Returns:
        0 on success, 1 if the input contains no alignments.
    """
    # barcode dict to count reads per barcode
    bc = defaultdict(int)
    # barcode dict to track distinct umi and count them
    bc_umi = {}
    lnum = 0
    lbc_last = ""

    # for output conversion
    file_queue = JoinableQueue()
    pool = []

    ms.message('Counting per-barcode reads and UMI')
    t0 = time()

    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:
        for aln in fin:
            lnum += 1

            nparts = (aln.query_name).split(":")
            lbc = nparts[0]    # barcode is first
            umi = nparts[-1]   # umi is last

            bc[lbc] += 1

            umis = bc_umi.setdefault(lbc, {})
            umis[umi] = umis.get(umi, 0) + 1

            if (lnum % 1000000) == 0:
                ms.progress_message("parsed {} reads".format(lnum))

    # final progress message and total time of parsing
    ms.progress_message("parsed {} reads".format(lnum), last=True)
    sys.stderr.write("{} sec\n".format(time() - t0))

    #
    # implement cell number detection per 10x.
    # take the 'exp-cells' value (expected cells) and multiply that by 0.01
    # to get an index. sort the barcodes by umi count in descending order,
    # take the umi count at that index and scale it by 0.1. every barcode,
    # starting from the top of the sorted list, with at least that many UMI
    # is called a cell.
    #
    t0 = time()
    ms.message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id.
    bc_umi_counts = []
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        fout.write("barcode\tumi_count\tdistinct_reads\n")
        for lbc in bc:
            n_umi = len(bc_umi[lbc])
            bc_umi_counts.append([lbc, n_umi])
            fout.write("\t".join(map(str, [lbc, n_umi, bc[lbc]])))
            fout.write("\n")

    if not bc_umi_counts:
        # nothing to estimate from; the indexing below would raise IndexError
        ms.error_message("No cell barcodes found in {}".format(args.bam))
        return 1

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)

    # clamp the reference index into the list: small exp_cells values gave
    # -1 (i.e. the *lowest* barcode) and large ones ran past the end
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))
    exp_cells = min(max(exp_cells, 0), len(bc_umi_counts) - 1)
    umi_threshold = bc_umi_counts[exp_cells][1] * 1.0 / 10

    num_reads = 0
    num_umi = 0
    num_bc = len(bc)
    num_cells = 0
    # bounded iteration: the old "while True" indexed past the end of the
    # list whenever every barcode passed the threshold
    for lbc, n_umi in bc_umi_counts:
        if n_umi < umi_threshold:
            break
        # count umi and count distinct reads
        num_reads += bc[lbc]
        num_umi += n_umi
        num_cells += 1

    # the top barcode always meets a threshold derived from a count no larger
    # than its own, so num_cells >= 1 from here on

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:
        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # median barcode (by position in the umi-sorted list) and its read
        # count. integer division keeps the index an int, and n // 2 is the
        # 0-based middle element (the old odd-count branch pointed one past
        # it and out of range for a single cell)
        median_idx = num_cells // 2
        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(bc[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes: {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate: {}\n".format(num_cells))

    if args.estimate_only:
        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output: {}\n".format(num_cells))

    t0 = time()

    # make set of the barcodes that we will keep
    bc_keep = set()
    for i in range(num_cells):
        bc_keep.add(bc_umi_counts[i][0])

    #
    # now we can dig back into the bam file to export all of the individual
    # cell lines

    # launch processes for gzip compression
    for i in range(args.p):
        p = Process(target=compress_reads, args=(file_queue, ))
        p.daemon = True
        p.start()
        pool.append(p)

    # records of the barcode currently being read; joined once per file
    # instead of growing a string alignment by alignment
    buffered = []
    bc_out = 0

    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:
        for aln in fin:
            lnum += 1
            # barcode is first
            lbc = (aln.query_name).split(":")[0]

            if lbc != lbc_last:
                if lbc_last in bc_keep:
                    # barcode changed: write buffered data to file...
                    bc_out += 1
                    fname = "{}/{}.fastq".format(args.outpath, lbc_last)
                    ms.progress_message("writing {} ({} of {})".format(
                        fname, bc_out, num_cells))
                    with open(fname, "w") as fout:
                        fout.write("".join(buffered))
                    file_queue.put(fname)

                buffered = []

            # the random draw is made for every alignment, before the
            # barcode test, exactly as before
            if random.random() < args.samplerate:
                # passes sampling limit
                if lbc in bc_keep:
                    # convert line to fastq and buffer it
                    buffered.append(fastq_from_aln(aln))

            lbc_last = lbc

    # flush the final barcode
    if lbc_last in bc_keep:
        bc_out += 1
        fname = "{}/{}.fastq".format(args.outpath, lbc_last)
        ms.progress_message("writing {} ({} of {})".format(
            fname, bc_out, num_cells), last=True)
        with open(fname, "w") as fout:
            fout.write("".join(buffered))
        # put fname in the queue to be compressed
        file_queue.put(fname)

    # one None sentinel per worker, then wait for queue and processes
    for p in pool:
        file_queue.put(None)

    file_queue.join()

    for p in pool:
        p.join()

    ms.message("finished!")

    return 0
def core(args):
    """Shred gene sequences into reads, align them back and report mappability.

    Stages (each can be selected through flags on ``args``):

    * shredding - skipped when ``args.just_alignment`` or
      ``args.just_quantification`` is set.  For every gene in the refflat
      annotation (``args.ref``) the sequences are exported from an
      upper-cased copy of ``args.fasta`` and a ``[stub, gene]`` task is
      queued for ``args.p`` ``worker`` processes; the files the workers
      report are concatenated into ``args.o``.
    * alignment - when ``args.bbmap`` is set (forced by
      ``just_alignment``): ``args.o`` is aligned against the upper-cased
      reference with ``bbmap`` using ``args.t``.
    * quantification - when ``args.quantify`` is set (forced by
      ``just_quantification``): alignments are parsed and a per-gene report
      is written next to the BAM with a ``.tsv`` extension.

    Temporary files named after the md5 of ``args.fasta`` are removed at the
    end; ``args.o`` is gzipped when ``args.z`` is set and shredding ran.

    Returns:
        0 on success, 1 if the reads file or the alignment file expected by
        a later stage is missing.
    """
    # hashlib needs bytes; on Python 3 a str path has to be encoded first
    # (on Python 2 str already is bytes and is passed through unchanged)
    fasta_key = args.fasta
    if not isinstance(fasta_key, bytes):
        fasta_key = fasta_key.encode("utf-8")
    stub = hashlib.md5(fasta_key).hexdigest()

    fixed_case = "{}.ref.fa".format(stub)
    gene_shred = "{}.shred.fa".format(stub)
    gene_final = "{}.final.fa".format(stub)

    # characters stripped from gene ids before they are used in file names;
    # the pattern is compiled once rather than once per gene
    sub_chars = re.escape(r"[]{}\|/?!@#$%^&*()+=.") + r"\s"
    strip_re = re.compile("[{}]".format(sub_chars))

    # temp files removed at the end
    fset = [fixed_case, "{}.fai".format(fixed_case), gene_shred, gene_final]

    ##
    # parse refflat
    annot, tid2gname = load_refflat(args.ref)

    ##
    # change case of input fasta
    ms.message("Converting all reference bases to uppercase")
    fasta_fix_case(args.fasta, fixed_case)

    if args.just_alignment:
        args.bbmap = True

    if args.just_quantification:
        args.quantify = True

    if not args.just_alignment and not args.just_quantification:
        ##
        # create worker processes to deal with all of the shredding
        tasks = JoinableQueue()
        results = Queue()
        pool = []

        for _ in range(args.p):
            p = Process(target=worker, args=(
                tasks,
                results,
            ))
            p.daemon = True
            p.start()
            pool.append(p)

        ##
        # get to work!
        n = len(annot.keys())
        ms.message("Starting main loop")
        for i, gid in enumerate(annot.keys(), 1):
            if (i % 3) == 0:
                ms.progress_message("processing {}. {} of {}".format(
                    gid, i, n))

            gidHat = strip_re.sub("", gid)

            # export all sequences belonging to this gene
            gene_seq = "{}.{}.fa".format(stub, gidHat)
            samtools_faidx(fixed_case, gene_seq, annot[gid])

            tasks.put([stub, gidHat])

        # one None sentinel per worker
        for p in pool:
            tasks.put(None)

        ms.progress_message("Waiting for shredding to complete", last=True)
        tasks.join()

        # NOTE(review): the workers are joined before `results` is drained.
        # If a worker puts many items on `results` this ordering can block
        # (see the multiprocessing guideline on joining processes that use
        # queues) - confirm how much each worker reports.
        for p in pool:
            p.join()

        ms.message("done")

        # terminate the result stream, then merge every reported file
        results.put(None)

        while True:
            fname = results.get()
            if fname is None:
                break

            if not isfile(fname):
                continue

            ms.message("Joining {}".format(fname))
            cat_result(fname, args.o)
            unlink(fname)

        # done!

    bam_out = re.sub(r"\.fasta$", ".bam", args.o)

    if args.bbmap:
        if not isfile(args.o):
            gz_reads = "{}.gz".format(args.o)
            if not isfile(gz_reads):
                ms.error_message(
                    "Shredded reads file doest not exist, gzipped or not ({})".
                    format(args.o))
                return 1
            # gunzip the reads file
            runcmd("gunzip {}.gz".format(args.o))

        ms.message("Aligning shreds back against the reference")
        bbmap(args.o, fixed_case, bam_out, args.t)

    if args.quantify:
        if not isfile(bam_out):
            ms.error_message(
                "Expected alignment file does not exist ({})".format(bam_out))
            return 1

        ms.message("Parsing alignments")
        pares = parse_alignments(bam_out, tid2gname)
        rres = process_pares(pares)

        tsvout = re.sub(r"\.bam$", ".tsv", bam_out)

        # output:
        # gene_name, total_reads, min_mapp, max_mapp, mean_mapp,
        # most_similar, all_genes, all_gene_counts
        ms.message("Writing mappability report")
        with open(tsvout, "w") as fout:
            fout.write("#read_length={}\n".format(args.l))
            fout.write("\t".join([
                "#gene_name", "total_reads", "min_mapp", "max_mapp",
                "mean_mapp", "most_similar", "all_targets", "target_counts"
            ]) + "\n")

            for gname in sorted(rres.keys()):
                entry = rres[gname]
                total_reads = entry['total_reads']
                if total_reads == 0:
                    ms.warning_message("{} had zero reads".format(gname))

                # defaults for a gene with no off-target hits
                min_mapp = 1
                max_mapp = 1
                mean_mapp = 1
                most_similar = "na"
                all_genes = "na"
                all_gene_counts = "na"

                if len(entry['target']) > 0:
                    # per-target value: 1 - (reads on target / total reads),
                    # or 0 when the gene had no reads at all
                    mapp = []
                    for n in entry['target_count']:
                        if total_reads > 0:
                            mapp.append(1 - n * 1.0 / total_reads)
                        else:
                            mapp.append(0)

                    min_mapp = min(mapp)
                    max_mapp = max(mapp)
                    mean_mapp = np.mean(mapp)

                    # the last target holding the minimum value wins
                    for idx, value in enumerate(mapp):
                        if value == min_mapp:
                            most_similar = entry['target'][idx]

                    all_genes = ",".join(entry['target'])
                    all_gene_counts = ",".join(
                        map(str, entry['target_count']))

                lout = [
                    gname, total_reads, min_mapp, max_mapp, mean_mapp,
                    most_similar, all_genes, all_gene_counts
                ]

                fout.write("\t".join(map(str, lout)) + "\n")

    if not args.just_quantification and not args.just_alignment:
        if args.z:
            # remove a stale archive first so gzip does not stop on it
            if isfile("{}.gz".format(args.o)):
                unlink("{}.gz".format(args.o))

            cmd = "gzip {}".format(args.o)
            runcmd(cmd)

    #
    # clear out temp files
    for fname in fset:
        if isfile(fname):
            unlink(fname)

    return 0
def core(args):
    """Convert a FASTQ file (optionally gzipped) into an unaligned BAM.

    Every 4-line FASTQ record from ``args.fastq`` becomes one SAM line with
    flag 4 (unmapped) carrying the read name (leading "@" removed), the
    sequence and the quality string.  The SAM is written to a temporary
    ``<md5 of the input path>.sam`` in the current directory, converted to
    ``args.bam`` with ``samtools view`` and then removed.

    ``args.t`` selects the samtools thread count: 0 means half of the
    available CPUs, and values above the CPU count are capped to it.

    Returns:
        0 on success, 1 if samtools fails (the temporary SAM is left in
        place in that case).
    """
    # template for an unaligned SAM record
    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
    linen = 0
    rnum = 0

    # hashlib needs bytes; on Python 3 a str path has to be encoded first
    # (on Python 2 str already is bytes and is passed through unchanged)
    fastq_key = args.fastq
    if not isinstance(fastq_key, bytes):
        fastq_key = fastq_key.encode("utf-8")
    tmpname = hashlib.md5(fastq_key).hexdigest()
    tmpsam = "{}.sam".format(tmpname)

    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file; "rt" so that gzipped input yields text lines like the
    # plain-file branch does (mode "r" means binary for gzip on Python 3)
    if re.search(r"\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "rt")
    else:
        fin = open(args.fastq, "r")

    t0 = time()

    # both handles are closed even if parsing fails midway
    with fin, open(tmpsam, "w") as fout:
        fout.write(samout)

        for szl in fin:
            linen += 1
            if linen == 1:
                # read name
                rname = szl.strip()
            elif linen == 2:
                # read
                seq = szl.strip()
            elif linen == 4:
                qual = szl.strip()

                # build the sam alignment record for writing
                sam = list(sam0)
                sam[SAM_QNAME] = re.sub(r"^@", "", rname)
                sam[SAM_SEQ] = seq
                sam[SAM_QUAL] = qual

                # write line out to file
                fout.write("\t".join(sam))
                fout.write("\n")

                # reset line counter
                linen = 0
                rnum += 1

                if (rnum % 1000000) == 0:
                    ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)
    ms.time_diff(t0)

    ms.message("Converting to BAM")
    t0 = time()

    # the original called the misspelled "cpu_cout()" here, which raised
    # NameError for every non-zero args.t; integer division keeps the thread
    # count an int so the command line does not end up with "-@ 2.0"
    ncpu = cpu_count()
    t = args.t
    if t == 0:
        t = ncpu // 2
    elif t > ncpu:
        t = ncpu

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1
    ms.time_diff(t0)

    unlink(tmpsam)

    return 0