def main(args):
    if not isfile(args.batch_file):
        ms.error_message("Input batch file does not exist ({})".format(args.batch_file))
        return 1

    flist = []
    err_flag = False

    ms.message("Checking batch file")
    t0 = time()
    # load the batch file and confirm all of the input files exist.
    with open(args.batch_file, "r") as fin:
        for szl in fin:
            fname = szl.strip()
            if not isfile(fname):
                err_flag = True
                ms.error_message("{} does not exist".format(fname))
            else:
                flist.append(fname)

    ms.time_diff(t0)

    if err_flag:
        return 1

    rres = core(flist)
    return rres
def main(args):
    ##
    ## check input files
    ##
    if not isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if args.o is not None:
        if isfile(args.o):
            ms.message("Output file exists. Overwriting.")

        if args.z:
            if not re.search(r"\.gz$", args.o):
                args.o += ".gz"
        else:
            if re.search(r"\.gz$", args.o):
                # auto enable gzip if the output file name has .gz at the end
                args.z = True

    rres = core(args)
    return 0
def main(args):
    if not isfile(args.gtf):
        ms.error_message("Input file does not exist: {}".format(args.gtf))
        return 1

    rres = core(args)
    return rres
def main(args):
    ##
    ## check input files
    ##
    if not isfile(args.fastq):
        ms.error_message("Input file does not exist")
        return 1

    rres = core(args)
    return 0
def main(args):
    # variables

    # check input files
    if not isfile(args.fasta):
        ms.error_message("Input FASTA file does not exist")
        return 1

    if not isfile(args.ref):
        ms.error_message("Input refFlat annotation does not exist")
        return 1

    if args.o is None:
        args.o = "{}.shreds.{}.fasta".format(drop_file_ext(args.fasta), args.l)
    else:
        tmp = "{}.shreds.{}.fasta".format(args.o, args.l)
        args.o = tmp

    if not args.just_alignment and not args.just_quantification:
        if isfile(args.o):
            unlink(args.o)
        if isfile("{}.gz".format(args.o)):
            unlink("{}.gz".format(args.o))
        if isfile("shred.log"):
            unlink("shred.log")
        if isfile("dedupe.log"):
            unlink("dedupe.log")

    if args.quantify:
        args.bbmap = True

    # good to go!
    if args.s:
        rres = core2(args)
    else:
        rres = core(args)

    return rres
def main(args):
    # check input file
    if not os.path.isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if not re.search(r"\.bam$", args.bam):
        ms.error_message("Input file should be a BAM file.")
        return 1

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    rres = core(args)
    return rres
def main(args):
    # check input file
    if not os.path.isfile(args.gtf):
        ms.error_message("Input file is missing")
        return 1

    t0 = time()
    ms.message("Loading annotation")
    dtid, dtid2gname, all_attrs, tid_order = load(args.gtf)
    ms.time_diff(t0)

    tmp = all_attrs.difference(set(["transcript_id", "gene_id", "gene_name"]))
    all_attrs = sorted(list(tmp))

    # header
    sys.stdout.write("chrom\tdb\tfeature\tstart\tend\tscore\tstrand\tframe\ttranscript_id\tgene_id\tgene_name\t")
    sys.stdout.write("\t".join(all_attrs))
    sys.stdout.write("\n")

    for tid in tid_order:
        for gg in dtid[tid]:
            # print each row
            lout = gg.parts[0:8]
            lout.append(gg.transcript_id())
            lout.append(gg.gene_id())
            lout.append(gg.gene_name())

            for aid in all_attrs:
                if aid in gg.attr:
                    lout.append(gg.attr[aid])
                else:
                    lout.append("na")

            sys.stdout.write("\t".join(lout) + "\n")

    return 0
def main(args):
    # variables
    left = []
    right = []

    # figure out what's up
    if args.a is not None or args.b is not None:
        if not (args.a is not None and args.b is not None):
            ms.error_message("You must specify both -a and -b if running a single sample.")
            return 1

    if args.a is not None and args.b is not None:
        # single sample
        left = [args.a]
        right = [args.b]

    elif args.f is not None:
        if not isfile(args.f):
            ms.error_message("Batch file does not exist")
            return 1

        # we have a batch file
        with open(args.f, "r") as fin:
            for szl in fin:
                aln = szl.strip().split("\t")
                left.append(aln[0])
                right.append(aln[1])

    if len(left) < 1 or len(right) < 1:
        ms.error_message("No samples to process!")
        return 1

    rres = core(left, right, args)
    return rres
if argc < 2:
    ms.message(usage())
    sys.exit(1)

argv = argv[1:len(argv)]

stat_list = []

# t0 = time()
# lktable2 = ta.build_lktable()
# ms.time_diff(t0)

for i in range(len(argv)):
    if not isfile(argv[i]):
        ms.error_message("file does not exist ({})".format(argv[i]))
        continue

    mstat = Stats()

    ms.message("parsing {}".format(argv[i]))
    t0 = time()
    with ps.AlignmentFile(argv[i]) as fin:
        rnames = ps_tools.get_alignmentfile_rnames(fin)
        numhit = 0

        for aln in fin:
            mstat.lines += 1
            if (aln.flag & 0x100) == 0:
                mstat.reads += 1
def main(args):
    # variables
    bam_file = False
    bc = {}
    bc_readcount = defaultdict(int)
    bc_umi = {}
    num_bc = 0
    offset = 0
    lnum = 0
    sz_umi = ""
    umi_file = ""
    # string to capture file summary table that's used with kallisto pseudo -b
    sz_table = "#id\tumiFile\tcellFile\n"
    dumi = None
    bam_flag = False
    sam_header = ""
    quant_mode = False

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    file_queue = JoinableQueue()
    p = None
    pool = []

    if args.R is not None:
        if not os.path.isfile(args.R):
            ms.error_message("Supplied annotation file does not exist ({})".format(args.R))
            return 1
        else:
            quant_mode = True

    ##
    ## figure out if we have a bam as input. if so we have to convert it to sam for indexing
    ##
    if re.search(r"\.bam$", args.fin):
        # send the sam file into the output folder
        sam_name = args.outpath + "/" + os.path.basename(re.sub(r"\.bam$", ".sam", args.fin))
        if not os.path.isfile(sam_name):
            # need to convert alignments to sam
            bam_flag = True
            cmd = "samtools view -h {} > {}".format(args.fin, sam_name)
            t0 = time()
            message("Temporarily converting BAM to SAM format")
            rres = runcmd(cmd)
            if rres[0] != 0:
                sys.stderr.write("Error: samtools exited with non-zero exit status!\n")
                return 1
            sys.stderr.write("{} sec\n".format(time() - t0))
    else:
        sam_name = args.fin

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##
    bc_pkl = args.outpath + "/" + BC_PICKLE
    bc_umi_pkl = args.outpath + "/" + BC_UMI_PICKLE
    bc_readcount_pkl = args.outpath + "/" + BC_READCOUNT
    sam_header_pkl = args.outpath + "/sam_header.pkl"

    if os.path.isfile(bc_pkl) and os.path.isfile(bc_umi_pkl) and os.path.isfile(bc_readcount_pkl):
        ##
        # load indexes from pickles
        ms.message("Loading existing barcode and umi indexes from output folder")
        t0 = time()
        bc = pickle.load(open(bc_pkl, "rb"))
        bc_umi = pickle.load(open(bc_umi_pkl, "rb"))
        bc_readcount = pickle.load(open(bc_readcount_pkl, "rb"))
        sam_header = pickle.load(open(sam_header_pkl, "rb"))
        num_bc = len(bc.keys())
        ms.time_diff(t0)
    else:
        # we have to index
        #
        # parse the alignments. in this loop we only extract the cell barcode and the umi
        # plus record the file position offsets for barcodes. the dict that is built
        # is indexed by the barcodes and each element contains a list of file offsets for
        # reads that came from that barcode. we also get all of the distinct umis collected
        # per barcode in this loop in order to estimate the actual cell count before
        # writing all of the read files out to disk
        message('Indexing cell barcodes from alignments and counting raw UMI.')
        t0 = time()
        with open(sam_name, "r") as fin:
            for szl in fin:
                if szl[0] == "@":
                    # append header line to header string
                    sam_header += szl
                    offset += len(szl)
                    continue

                # count lines and produce progress message so we know this thing is
                # running
                lnum += 1
                if lnum % 1000000 == 0:
                    progress_message("read {} lines".format(lnum))

                # fetch the cell barcode from the read name
                line_bc = parse_barcode(szl)

                if line_bc not in bc:
                    # first encounter with this barcode
                    num_bc += 1
                    # init a list for this barcode's line offsets within this sam file
                    bc[line_bc] = []
                    # init a dict for the barcode to track umis
                    bc_umi[line_bc] = defaultdict(int)

                # append line offset to this barcode's list
                bc[line_bc].append(offset)

                # get the umi and add it to this barcode's dict IF this is not a
                # secondary alignment
                aln = szl.split("\t")
                if (int(aln[1]) & 0x100) == 0:
                    # not a secondary alignment. track it.
                    umi = parse_umi(szl)
                    bc_umi[line_bc][umi] += 1

                if ((int(aln[1]) & 0x4) == 0) and ((int(aln[1]) & 0x100) == 0):
                    # this read is aligned and is a primary alignment so we can count this one
                    # into this barcode's aligned read count
                    bc_readcount[line_bc] += 1

                # update offset to the next line
                offset += len(szl)

        # final progress message and total time of parsing
        progress_message("read {} lines".format(lnum), last=True)
        sys.stderr.write("{} sec\n".format(time() - t0))

        t0 = time()
        if not args.no_pickles:
            ms.message("saving indexes to disk")
            pickle.dump(bc, open(bc_pkl, "wb"))
            pickle.dump(bc_umi, open(bc_umi_pkl, "wb"))
            pickle.dump(bc_readcount, open(bc_readcount_pkl, "wb"))
            pickle.dump(sam_header, open(sam_header_pkl, "wb"))
            ms.time_diff(t0)

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI. that's literally
    # how they do it.
    #
    t0 = time()
    message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\talignments\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(map(str, [lbc, num_umi, bc_readcount[lbc], len(bc[lbc])])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0
    i = 0
    while True:
        if bc_umi_counts[i][1] < bc_umi_counts[exp_cells][1] * 1.0 / 10:
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        num_reads += bc_readcount[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:
        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count
        if num_cells % 2 == 0:
            # even count
            median_idx = num_cells // 2
        else:
            median_idx = num_cells // 2 + 1

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(bc_readcount[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes: {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate: {}\n".format(num_cells))

    if args.estimate_only:
        if bam_flag:
            # input was BAM so we can dump the converted file. just putting in
            # some logic to be certain that the original file is not deleted.
            if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (sam_name != args.fin):
                os.unlink(sam_name)

        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output: {}\n".format(num_cells))

    t0 = time()
    message("Parsing individual detected cell alignments out to individual files")

    if quant_mode:
        # start quantification child processes for parsed sam files
        for i in range(args.p):
            p = Process(target=quantification_worker, args=(file_queue, args,))
            p.daemon = True
            p.start()
            pool.append(p)
    else:
        # start child processes for sam to bam conversion
        for i in range(args.p):
            p = Process(target=compress_reads, args=(file_queue,))
            p.daemon = True
            p.start()
            pool.append(p)

    fin = open(sam_name, "r")

    # write individual cell files
    i = 0
    sz_umi = ""
    while i < num_cells:
        # get barcode
        lbc = bc_umi_counts[i][0]

        # start output strings
        szout = sam_header
        #sz_umi = ""

        # setup output file name
        cell_file = "{}/{}.sam".format(args.outpath, lbc)
        #umi_file = "{}.umi".format(lbc)

        # update user on progress
        progress_message("Writing {} - {}/{} ({} reads)".format(cell_file, i + 1, num_cells, len(bc[lbc])))

        if args.samplerate < 1 and args.samplerate > 0:
            ##
            # to subsample we have to run through all read offsets for this cell and index the reads
            # then take a subset of them to write out to disk. I have to do this because the
            # alignment file contains secondary alignments which have to be collapsed by
            # read name prior to the subsampling.
            read_index = defaultdict(list)
            for offset in bc[lbc]:
                fin.seek(offset)
                aln = fin.readline().strip().split("\t")
                rname = aln[0]
                read_index[rname].append(offset)

            #
            # now by looping through distinct reads we can dump out only those that pass at the specified rate
            for rname in read_index.keys():
                if random.random() > args.samplerate:
                    continue

                # dump this read
                for offset in read_index[rname]:
                    fin.seek(offset)
                    szout += fin.readline()

        else:
            # loop through line offsets for this barcode and append lines to the output string
            for offset in bc[lbc]:
                fin.seek(offset)
                szout += fin.readline()

        # write the file
        with open(cell_file, "w") as fout:
            fout.write(szout)

        # send the file off for bam compression
        file_queue.put(cell_file)

        i += 1

    fin.close()

    sys.stderr.write("\n")
    sys.stderr.write("{} sec\n".format(time() - t0))

    if bam_flag:
        # input was BAM so we can dump the converted file. just putting in
        # some logic to be certain that the original file is not deleted.
        if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (sam_name != args.fin):
            os.unlink(sam_name)

    sys.stderr.write("Waiting for child processes to finish compressing files\n")

    for p in pool:
        file_queue.put(None)

    file_queue.join()

    for p in pool:
        p.join()

    message("finished!")
    return 0
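# Illustrative sketch (an assumption, not part of the pipeline above): the 10x-style
# cell-count heuristic described in the comments of main(), pulled out as a pure
# function so the thresholding logic can be tested on its own. The function name and
# signature are hypothetical; `umi_counts` is assumed to be the list of per-barcode
# distinct-UMI counts, and `math` is the same module-level import main() relies on.
def estimate_cell_count(umi_counts, exp_cells):
    # sort distinct-UMI counts in descending order
    counts = sorted(umi_counts, reverse=True)
    # index of the reference barcode: 1% of the expected cell count (as in main())
    idx = int(math.floor(exp_cells * 0.01 - 1))
    idx = min(idx, len(counts) - 1)  # guard for small inputs; main() assumes enough barcodes
    # threshold is 10% of the UMI count at that reference index
    threshold = counts[idx] * 1.0 / 10
    # accept barcodes from the top of the sorted list until one falls below the threshold
    n = 0
    while n < len(counts) and counts[n] >= threshold:
        n += 1
    return n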
def core(left, right, args):
    ##
    ## create queue and child process for compressing the fastq files
    ##
    bc_len = args.barcode_length
    umi_len = args.umi_length

    tasks = JoinableQueue()
    p = Process(target=gz_worker, args=(tasks,))
    p.daemon = True
    p.start()

    ##
    ## input files are paired from the sequencer so we just have to read through them
    ## and write them back out
    for i in range(len(left)):
        m1 = left[i]
        m2 = right[i]

        # pick apart the name of the second file to build the output name
        base = basename(m2)
        path = dirname(m2)
        base_parts = base.split(".")
        stub = base_parts[0]
        outfile = "{}_prepped.fastq".format(stub)

        if outfile == m1 or outfile == m2:
            ms.error_message("Output file path matches input file path. WTF? {}".format(outfile))
            sys.exit(1)

        if isfile(outfile):
            ms.warning_message("output file exists. overwriting. {}".format(outfile))

        fout = open(outfile, "w")

        try:
            ms.message("Processing {}".format(m1))
            with open_reads(m1) as fin1, open_reads(m2) as fin2:
                nidx = 0
                nreads = 0
                lread = []
                for szl2 in fin2:
                    if (nreads % 1000000) == 0:
                        ms.progress_message("Parsed {} reads".format(nreads))

                    nidx += 1
                    if nidx == 1:
                        # read name line
                        rname = szl2.strip().split()
                        rname = rname[0]

                        # read two lines from the barcode file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline().strip()

                        # this is the barcode so we can pick it apart. I'm going to put the cell barcode
                        # at the front of the read so that I can maybe leverage samtools sort to sort
                        # barcodes together for me prior to parsing cells out
                        rname_tmp = re.sub(r"^\@", "", rname)
                        rname = "@{}:{}".format(szl1[0:bc_len], rname_tmp)
                        # 20180226
                        # moved the cell barcode to the front of the read name so we only
                        # need to write the umi at the end and not both
                        #rname += ":{}:{}".format(szl1[0:16], szl1[16:len(szl1)])
                        rname += ":{}".format(szl1[bc_len:(bc_len + umi_len)])

                        lread.append(rname + "\n")

                        # read the remaining lines for this read from the barcodes file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline()

                    elif nidx < 4:
                        lread.append(szl2)

                    if nidx == 4:
                        # finished with read
                        lread.append(szl2)
                        fout.write("".join(lread))
                        nidx = 0
                        lread = []
                        nreads += 1

                ms.progress_message("Parsed {} reads".format(nreads), last=True)

        except:
            fout.close()
            sys.exit(1)

        fout.close()

        #ms.message("compressing {}".format(outfile))
        #system("gzip -f {}".format(outfile))
        tasks.put(outfile)

    tasks.put(None)
    ms.message("Waiting for gzip compression to complete.")
    tasks.join()
    p.join()

    # done
    return 0
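# The helpers used by this prep step and by the demultiplexer above (open_reads,
# parse_barcode, parse_umi) are defined elsewhere in this repo. Minimal sketches of
# what they are assumed to do, given the read-name layout core() writes
# ("@<cell barcode>:<original name>:<umi>"); the bodies below are assumptions, and
# `re`/`gzip` are taken to be module-level imports.

def open_reads(fname):
    # open a FASTQ file, transparently handling gzip compression
    if re.search(r"\.gz$", fname):
        return gzip.open(fname, "r")
    return open(fname, "r")

def parse_barcode(sam_line):
    # QNAME is the first tab-delimited SAM field; the cell barcode is its first ':' token
    return sam_line.split("\t")[0].split(":")[0]

def parse_umi(sam_line):
    # the UMI is the last ':' token of the QNAME
    return sam_line.split("\t")[0].split(":")[-1]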
def main(args):
    # variables
    dkey_index = defaultdict(list)
    ltable = []
    lheader = []
    idx = 0
    col_idx = args.c - 1
    num_col = []

    if not isfile(args.source_file):
        ms.error_message("Input file does not exist")
        return 1

    ##
    ## open input file and parse it in
    with open(args.source_file, "r") as fin:
        if args.H:
            szl = fin.readline()
            lheader = szl.strip().split("\t")

        for szl in fin:
            # put each row of the file into a list. also keep track of which rows belong to
            # which keys in a dict
            aln = szl.strip().split("\t")
            ltable.append(aln)
            num_col.append(len(aln))
            dkey_index[aln[col_idx]].append(idx)
            idx += 1

    ##
    # check column count for consistency
    if min(num_col) != max(num_col):
        ms.error_message("Column count is not consistent throughout the input file!")
        return 1

    num_col = min(num_col)

    ##
    ## build collapsed version
    lout = []
    for kid in sorted(dkey_index.keys()):
        tmp = []
        num_row = len(dkey_index[kid])

        for i in range(num_col):
            if i == col_idx:
                continue

            if num_row > 1:
                # this key has multiple rows so we have to collapse the value
                # from the current column within each row into a single string
                ltmp = []
                for j in dkey_index[kid]:
                    ltmp.append(ltable[j][i])

                # check to see if this field has more than a single level
                set_tmp = set(ltmp)
                if len(set_tmp) == 1:
                    # single level
                    tmp.append(ltmp[0])
                else:
                    # more than one level so keep all of them
                    tmp.append(",".join(ltmp))
            else:
                # no collapse, single row
                tmp.append(ltable[dkey_index[kid][0]][i])

        lout.append("\t".join([kid] + tmp) + "\n")

    if args.H:
        hout = [lheader[col_idx]]
        for i in range(num_col):
            if i != col_idx:
                hout.append(lheader[i])

        sys.stdout.write("\t".join(hout))
        sys.stdout.write("\n")

    for i in range(len(lout)):
        sys.stdout.write(lout[i])

    return 0
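# Worked example of the collapse above (illustrative values). With the key in
# column 1 (-c 1), an input of
#
#   geneA    tx1    100
#   geneA    tx2    100
#   geneB    tx3    250
#
# collapses to
#
#   geneA    tx1,tx2    100
#   geneB    tx3        250
#
# columns whose values agree across a key's rows keep the single value; columns
# that differ are joined with commas, with the key column written first.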
def core(args):
    annot = {}
    tid2gname = {}

    stub = hashlib.md5(args.fasta).hexdigest()
    fixed_case = "{}.ref.fa".format(stub)
    #gene_seq = "{}.gene.fa".format(stub)
    gene_shred = "{}.shred.fa".format(stub)
    gene_final = "{}.final.fa".format(stub)
    sub_chars = re.escape("[]{}\|/?!@#$%^&*()+=.") + "\s"

    fset = [fixed_case, "{}.fai".format(fixed_case), gene_shred, gene_final]

    ##
    # parse refflat
    annot, tid2gname = load_refflat(args.ref)

    ##
    # change case of input fasta
    ms.message("Converting all reference bases to uppercase")
    rres = fasta_fix_case(args.fasta, fixed_case)

    if args.just_alignment:
        args.bbmap = True
    if args.just_quantification:
        args.quantify = True

    if not args.just_alignment and not args.just_quantification:
        ##
        # create worker processes to deal with all of the shredding
        tasks = JoinableQueue()
        results = Queue()
        p = None
        pool = []

        for i in range(args.p):
            p = Process(target=worker, args=(tasks, results,))
            p.daemon = True
            p.start()
            pool.append(p)

        ##
        # get to work!
        i = 0
        n = len(annot.keys())
        ms.message("Starting main loop")
        for gid in annot.keys():
            i += 1
            if (i % 3) == 0:
                ms.progress_message("processing {}. {} of {}".format(gid, i, n))

            gidHat = re.sub("[{}]".format(sub_chars), "", gid)

            # export all sequences belonging to this gene
            gene_seq = "{}.{}.fa".format(stub, gidHat)
            rres = samtools_faidx(fixed_case, gene_seq, annot[gid])

            tasks.put([stub, gidHat])

        for p in pool:
            tasks.put(None)

        ms.progress_message("Waiting for shredding to complete", last=True)
        tasks.join()

        for p in pool:
            p.join()

        ms.message("done")

        results.put(None)
        while True:
            fname = results.get()
            if fname is None:
                break

            if not isfile(fname):
                continue

            ms.message("Joining {}".format(fname))
            rres = cat_result(fname, args.o)
            unlink(fname)

    # done!
    bam_out = re.sub(r"\.fasta$", ".bam", args.o)

    if args.bbmap:
        if not isfile(args.o):
            if not isfile("{}.gz".format(args.o)):
                ms.error_message("Shredded reads file does not exist, gzipped or not ({})".format(args.o))
                return 1
            else:
                # gunzip the reads file
                runcmd("gunzip {}.gz".format(args.o))

        ms.message("Aligning shreds back against the reference")
        rres = bbmap(args.o, fixed_case, bam_out, args.t)

    if args.quantify:
        if not isfile(bam_out):
            ms.error_message("Expected alignment file does not exist ({})".format(bam_out))
            return 1

        ms.message("Parsing alignments")
        pares = parse_alignments(bam_out, tid2gname)
        rres = process_pares(pares)

        tsvout = re.sub(r"\.bam$", ".tsv", bam_out)

        # output:
        # gene_name, total_reads, min_mapp, max_mapp, mean_mapp, most_similar, all_genes, all_gene_counts
        ms.message("Writing mappability report")
        with open(tsvout, "w") as fout:
            fout.write("#read_length={}\n".format(args.l))
            fout.write("\t".join([
                "#gene_name", "total_reads", "min_mapp", "max_mapp", "mean_mapp",
                "most_similar", "all_targets", "target_counts"
            ]) + "\n")

            for gname in sorted(rres.keys()):
                total_reads = rres[gname]['total_reads']
                if total_reads == 0:
                    ms.warning_message("{} had zero reads".format(gname))

                min_mapp = 1
                max_mapp = 1
                mean_mapp = 1
                most_similar = "na"
                all_genes = "na"
                all_gene_counts = "na"

                if len(rres[gname]['target']) > 0:
                    mapp = []
                    for n in rres[gname]['target_count']:
                        if total_reads > 0:
                            mapp.append(1 - n * 1.0 / total_reads)
                        else:
                            mapp.append(0)

                    min_mapp = min(mapp)
                    max_mapp = max(mapp)
                    mean_mapp = np.mean(mapp)

                    for i in range(len(mapp)):
                        if mapp[i] == min_mapp:
                            most_similar = rres[gname]['target'][i]

                    all_genes = ",".join(rres[gname]['target'])
                    all_gene_counts = ",".join(map(str, rres[gname]['target_count']))

                lout = [
                    gname, total_reads, min_mapp, max_mapp, mean_mapp,
                    most_similar, all_genes, all_gene_counts
                ]
                fout.write("\t".join(map(str, lout)) + "\n")

    if not args.just_quantification and not args.just_alignment:
        if args.z:
            if isfile("{}.gz".format(args.o)):
                unlink("{}.gz".format(args.o))
            cmd = "gzip {}".format(args.o)
            runcmd(cmd)

    #
    # clear out temp files
    for fname in fset:
        if isfile(fname):
            unlink(fname)

    return 0
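# Quick check of the mappability arithmetic used in the report above (illustrative
# numbers): if a gene produced total_reads = 200 shreds and 30 of them also align to
# some other gene, that target's entry is mapp = 1 - 30/200 = 0.85. min_mapp is the
# smallest such value across targets (the most-shared one), and most_similar reports
# which target gene produced it.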
def core(args):
    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
    linen = 0
    rnum = 0
    tmpname = hashlib.md5(args.fastq).hexdigest()

    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file
    if re.search(r"\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "r")
    else:
        fin = open(args.fastq, "r")

    # open SAM output
    fout = open("{}.sam".format(tmpname), "w")
    fout.write(samout)

    t0 = time()
    for szl in fin:
        linen += 1
        if linen == 1:
            # read name
            rname = szl.strip()
        elif linen == 2:
            # read
            seq = szl.strip()
        elif linen == 4:
            qual = szl.strip()

            # get the sam alignment read for writing
            sam = list(sam0)
            sam[SAM_QNAME] = re.sub(r"^\@", "", rname)
            sam[SAM_SEQ] = seq
            sam[SAM_QUAL] = qual

            # write line out to file
            fout.write("\t".join(sam))
            fout.write("\n")

            # reset line counter
            linen = 0
            rnum += 1
            if rnum > 0 and (rnum % 1000000) == 0:
                ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)
    ms.time_diff(t0)

    fout.close()
    fin.close()

    ms.message("Converting to BAM")
    t0 = time()

    t = args.t
    if t == 0:
        t = cpu_count() // 2
    elif t > cpu_count():
        t = cpu_count()

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1

    ms.time_diff(t0)

    unlink("{}.sam".format(tmpname))

    return 0
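# The SAM_* column indexes used by core() above are assumed to be module-level
# constants defined elsewhere in this repo; a sketch consistent with the 11-field
# unaligned SAM record template (sam0):
SAM_QNAME = 0   # read name (FASTQ header without the leading '@')
SAM_SEQ = 9     # read sequence
SAM_QUAL = 10   # base quality string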