Example 1
def main(args):

    ##
    ## check input files
    ##

    if not isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if args.o is not None:

        if isfile(args.o):
            ms.message("Output file exists. Overwriting.")

        if args.z:
            if not re.search("\.gz$", args.o):
                args.o += ".gz"

        else:
            if re.search("\.gz$", args.o):
                # auto enable gzip if the output file name has .gz at the end
                args.z = True

    rres = core(args)

    return 0
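
The main above assumes an argparse-style namespace with bam, o and z attributes; a minimal sketch of a command-line wrapper that builds such a namespace (the option names are assumptions, since the real parser is not shown):

import argparse
import sys

if __name__ == "__main__":
    # hypothetical CLI wrapper; the actual option names are not shown in the example
    parser = argparse.ArgumentParser(description="check and process a BAM file")
    parser.add_argument("bam", help="input BAM file")
    parser.add_argument("-o", default=None, help="output file name")
    parser.add_argument("-z", action="store_true", help="gzip the output")
    args = parser.parse_args()
    sys.exit(main(args))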
Example 2
def main(args):

    if not isfile(args.batch_file):
        ms.error_message("Input batch file does not exist ({})".format(
            args.batch_file))
        return 1

    flist = []
    err_flag = False

    ms.message("Checking batch file")
    t0 = time()

    # load the batch file and confirm all of the input files exist.
    with open(args.batch_file, "r") as fin:
        for szl in fin:
            fname = szl.strip()
            if not isfile(fname):
                err_flag = True
                ms.error_message("{} does not exist".format(fname))
            else:
                flist.append(fname)

    ms.time_diff(t0)

    if err_flag:
        return 1

    rres = core(flist)

    return 0
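
The batch file consumed above is simply one input path per line; a minimal sketch of preparing one and calling main (the file names and the SimpleNamespace wrapper are placeholders):

from types import SimpleNamespace

# hypothetical batch file listing one existing input file per line
with open("samples.batch", "w") as fout:
    fout.write("sampleA.txt\n")
    fout.write("sampleB.txt\n")

args = SimpleNamespace(batch_file="samples.batch")
main(args)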
Example 3
def core(args):

    dtid = {}

    ms.message("Parsing transcript entries in GTF")
    with open(args.gtf, "r") as fin:
        # parse the GTF to find the 'transcript' rows and gather info

        for szl in fin:
            if szl[0] == "#":
                continue

            grow = gtf_parseline(szl)

            if grow['type'] == "transcript":
                tid = grow['attrs']['transcript_id']
                dtid[tid] = grow['attrs']

    # finished. now we can pass back through and parse the exon rows out
    szout = ""
    ms.message("Incorporating transcript annotations into exon fields")
    with open(args.gtf, "r") as fin:

        for szl in fin:
            if szl[0] == "#":
                szout += szl
                continue

            grow = gtf_parseline(szl)
            if grow['type'] == "exon":
                tid = grow['attrs']['transcript_id']
                if tid in dtid:
                    # combine annotation information
                    rres = merge_annot(grow['attrs'], dtid[tid])
                    grow['attrs'] = copy.deepcopy(rres)

                szout += gtf_row_tostring(grow)
                szout += "\n"

    if args.o is not None:
        with open(args.o, "w") as fout:
            fout.write(szout)
    else:
        sys.stdout.write(szout)

    return 0
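
merge_annot is not shown in this example; a plausible sketch, assuming it layers the transcript-level attributes under the exon's own attributes so exon-specific values win on conflict:

def merge_annot(exon_attrs, transcript_attrs):
    # start from the transcript-level annotation, then overlay the exon's own attributes
    merged = dict(transcript_attrs)
    merged.update(exon_attrs)
    return merged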
Example 4
def main(args):

    # check input file
    if not os.path.isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if not re.search("\.bam$", args.bam):
        ms.error_message("Input file should be a BAM file.")
        return 1

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    rres = core(args)

    return rres
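
os.mkdir only creates the final path component and raises if the folder already exists by the time it runs; a slightly more defensive sketch (this goes beyond what the example itself does):

import os

def ensure_outpath(outpath):
    # create the output folder, including any missing parents, tolerating an existing folder
    os.makedirs(outpath, exist_ok=True)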
Example 5
def main(args):

    # check input file
    if not os.path.isfile(args.gtf):
        ms.error_message("Input file is missing")
        return 1

    t0 = time()
    ms.message("Loading annotation")
    dtid, dtid2gname, all_attrs, tid_order = load(args.gtf)
    ms.time_diff(t0)

    tmp = all_attrs.difference(set(["transcript_id", "gene_id", "gene_name"]))
    all_attrs = sorted(list(tmp))

    # header
    sys.stdout.write(
        "chrom\tdb\tfeature\tstart\tend\tscore\tstrand\tframe\ttranscript_id\tgene_id\tgene_name\t"
    )
    sys.stdout.write("\t".join(all_attrs))
    sys.stdout.write("\n")

    for tid in tid_order:
        for gg in dtid[tid]:
            # print each row
            lout = gg.parts[0:8]
            lout.append(gg.transcript_id())
            lout.append(gg.gene_id())
            lout.append(gg.gene_name())

            for aid in all_attrs:
                if aid in gg.attr:
                    lout.append(gg.attr[aid])
                else:
                    lout.append("na")

            sys.stdout.write("\t".join(lout) + "\n")

    return 0
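
load is not shown; the loop implies each transcript id maps to a list of GTF row objects exposing parts (the first eight GTF columns), an attr dict, and a few accessors. A minimal sketch of that assumed interface (not the actual class):

class GtfRow:
    # assumed shape of the objects returned by load()
    def __init__(self, parts, attr):
        self.parts = parts  # chrom, db, feature, start, end, score, strand, frame
        self.attr = attr    # remaining key/value attributes from column 9

    def transcript_id(self):
        return self.attr.get("transcript_id", "na")

    def gene_id(self):
        return self.attr.get("gene_id", "na")

    def gene_name(self):
        return self.attr.get("gene_name", "na")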
Example 6
def usage():
    sz = "usage: bam-stats.py <sam/bam>"
    return sz


#==============================================================================
# main
#==============================================================================

if __name__ == "__main__":

    argv = sys.argv
    argc = len(sys.argv)

    if argc < 2:
        ms.message(usage())
        sys.exit(1)

    argv = argv[1:len(argv)]

    stat_list = []

    #	t0 = time()
    #	lktable2 = ta.build_lktable()
    #	ms.time_diff(t0)

    for i in range(len(argv)):
        if not isfile(argv[i]):
            ms.error_message("file does not exist ({})".format(argv[i]))
            continue
Example 7
def main(args):

    # variables
    bam_file = False

    bc = {}
    bc_readcount = defaultdict(int)
    bc_umi = {}
    num_bc = 0
    offset = 0
    lnum = 0
    sz_umi = ""
    umi_file = ""
    # string to capture file summary table that's used with kallisto pseudo -b
    sz_table = "#id\tumiFile\tcellFile\n"
    dumi = None
    bam_flag = False
    sam_header = ""
    quant_mode = False

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    file_queue = JoinableQueue()
    p = None
    pool = []

    if args.R is not None:
        if not os.path.isfile(args.R):
            ms.error_message(
                "Supplied annotation file does not exist ({})".format(args.R))
            return 1
        else:
            quant_mode = True

    ##
    ## figure out if we have a bam as input. if so we have to convert it to sam for indexing
    ##
    if re.search("\.bam$", args.fin):
        # send the sam file into the output folder
        sam_name = args.outpath + "/" + os.path.basename(
            re.sub("\.bam$", ".sam", args.fin))

        if not os.path.isfile(sam_name):
            # need to convert alignments to sam
            bam_flag = True
            cmd = "samtools view -h {} > {}".format(args.fin, sam_name)
            t0 = time()
            message("Temporarily converting BAM to SAM format")
            rres = runcmd(cmd)
            if rres[0] != 0:
                sys.stderr.write(
                    "Error: samtools exited with non-zero exit status!\n")
                return 1

            sys.stderr.write("{} sec\n".format(time() - t0))

    else:
        sam_name = args.fin

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##

    bc_pkl = args.outpath + "/" + BC_PICKLE
    bc_umi_pkl = args.outpath + "/" + BC_UMI_PICKLE
    bc_readcount_pkl = args.outpath + "/" + BC_READCOUNT
    sam_header_pkl = args.outpath + "/sam_header.pkl"

    if os.path.isfile(bc_pkl) and os.path.isfile(
            bc_umi_pkl) and os.path.isfile(bc_readcount_pkl):
        ##
        # load indexes from pickles
        ms.message(
            "Loading existing barcode and umi indexes from output folder")
        t0 = time()
        bc = pickle.load(open(bc_pkl, "rb"))
        bc_umi = pickle.load(open(bc_umi_pkl, "rb"))
        bc_readcount = pickle.load(open(bc_readcount_pkl, "rb"))
        sam_header = pickle.load(open(sam_header_pkl, "rb"))
        num_bc = len(bc.keys())
        ms.time_diff(t0)

    else:
        # we have to index

        #
        # parse the alignments. in this loop we only extract the cell barcode and the umi
        # plus record the file position offsets for barcodes. the dict that is built
        # is indexed by the barcodes and each element contains a list of file offsets for
        # reads that came from that barcode. we also get all of the distinct umis collected
        # per barcode in this loop in order to estimate the actual cell count before
        # writing all of the read files out to disk
        message('Indexing cell barcodes from alignments and counting raw UMI.')
        t0 = time()
        with open(sam_name, "r") as fin:

            for szl in fin:
                if szl[0] == "@":
                    # append header line to header string
                    sam_header += szl
                    offset += len(szl)
                    continue

                # count lines and produce progress message so we know this thing is
                # running
                lnum += 1
                if lnum % 1000000 == 0:
                    progress_message("read {} lines".format(lnum))

                # fetch the cell barcode from the read name
                line_bc = parse_barcode(szl)

                if line_bc not in bc:
                    # first encounter with this barcode
                    num_bc += 1
                    # init a list for this barcode's line offsets within this sam file
                    bc[line_bc] = []
                    # init a dict for the barcode to track umis
                    bc_umi[line_bc] = defaultdict(int)

                # append line offset to this barcode's list
                bc[line_bc].append(offset)
                # get the umi and add it to this barcode's dict IF this is not a
                # secondary alignment
                aln = szl.split("\t")

                if (int(aln[1]) & 0x100) == 0:
                    # not a secondary alignment. track it.
                    umi = parse_umi(szl)
                    bc_umi[line_bc][umi] += 1

                if ((int(aln[1]) & 0x4) == 0) and ((int(aln[1]) & 0x100) == 0):
                    # this read is aligned and is a primary alignment so we can count this one
                    # into this barcode's aligned read count
                    bc_readcount[line_bc] += 1

                # update offset to the next line
                offset += len(szl)

        # final progress message and total time of parsing
        progress_message("read {} lines".format(lnum), last=True)
        sys.stderr.write("{} sec\n".format(time() - t0))

        t0 = time()

        if not args.no_pickles:
            ms.message("saving indexes to disk")
            pickle.dump(bc, open(bc_pkl, "wb"))
            pickle.dump(bc_umi, open(bc_umi_pkl, "wb"))
            pickle.dump(bc_readcount, open(bc_readcount_pkl, "wb"))
            pickle.dump(sam_header, open(sam_header_pkl, "wb"))
            ms.time_diff(t0)

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI.  that's literally
    # how they do it.
    #

    t0 = time()
    message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\talignments\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(
                map(str, [lbc, num_umi, bc_readcount[lbc],
                          len(bc[lbc])])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0

    i = 0
    while True:
        if bc_umi_counts[i][1] < bc_umi_counts[exp_cells][1] * 1.0 / 10:
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        num_reads += bc_readcount[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:

        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count
        if num_cells % 2 == 0:
            # even count
            median_idx = num_cells // 2
        else:
            median_idx = num_cells // 2 + 1

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(
            bc_readcount[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes:  {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate:     {}\n".format(num_cells))

    if args.estimate_only:
        if bam_flag:
            # input was BAM so we can dump the converted file. just putting in
            # some logic to be certain that the original file is not deleted.
            if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                    sam_name != args.fin):
                os.unlink(sam_name)

        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output:       {}\n".format(num_cells))

    t0 = time()

    message(
        "Parsing individual detected cell alignments out to individual files")

    if quant_mode:
        # start quantification child processes for parsed sam files
        for i in range(args.p):
            p = Process(target=quantification_worker,
                        args=(
                            file_queue,
                            args,
                        ))
            p.daemon = True
            p.start()
            pool.append(p)

    else:
        # start child process for sam to bam conversion
        for i in range(args.p):
            p = Process(target=compress_reads, args=(file_queue, ))
            p.daemon = True
            p.start()
            pool.append(p)

    fin = open(sam_name, "r")

    # write individual cell files
    i = 0
    sz_umi = ""
    while i < num_cells:
        # get barcode
        lbc = bc_umi_counts[i][0]
        # start output strings
        szout = sam_header
        #sz_umi = ""
        # setup output file name
        cell_file = "{}/{}.sam".format(args.outpath, lbc)
        #umi_file = "{}.umi".format(lbc)

        # update user on progress
        progress_message("Writing {} - {}/{} ({} reads)".format(
            cell_file, i + 1, num_cells, len(bc[lbc])))

        if args.samplerate < 1 and args.samplerate > 0:

            ##
            # to subsample we have to run through all read offsets for this cell and index the reads
            # then take a subset of them to write out to disk. I have to do this because the
            # alignment file contains secondary alignments which have to be collapsed by
            # read name prior to the subsampling.
            read_index = defaultdict(list)
            for offset in bc[lbc]:
                fin.seek(offset)
                aln = fin.readline().strip().split("\t")
                rname = aln[0]
                read_index[rname].append(offset)

            #
            # now by looping through distinct reads we can dump out only those that are at the specified rate
            for rname in read_index.keys():
                if random.random() > args.samplerate:
                    continue

                # dump this read
                for offset in read_index[rname]:
                    fin.seek(offset)
                    szout += fin.readline()

        else:

            # loop through line offsets for this barcode and append lines to the output string
            for offset in bc[lbc]:
                fin.seek(offset)
                szout += fin.readline()

        # write the file
        with open(cell_file, "w") as fout:
            fout.write(szout)

        # send the file off for bam compression
        file_queue.put(cell_file)

        i += 1

    fin.close()

    sys.stderr.write("\n")
    sys.stderr.write("{} sec\n".format(time() - t0))

    if bam_flag:
        # input was BAM so we can dump the converted file. just putting in
        # some logic to be certain that the original file is not deleted.
        if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                sam_name != args.fin):
            os.unlink(sam_name)

    sys.stderr.write("Waiting for child process to finish compressing files\n")

    for p in pool:
        file_queue.put(None)
    file_queue.join()

    for p in pool:
        p.join()

    message("finished!")

    return 0
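
The cell-count rule described in the long comment block above can be isolated into a few lines; a self-contained sketch with made-up UMI counts (this mirrors the thresholding in the example, it is not a replacement for it):

import math

def estimate_cell_count(umi_counts, exp_cells):
    # umi_counts: distinct-UMI count per barcode, in any order
    # exp_cells: the 'exp-cells' (expected cells) argument
    counts = sorted(umi_counts, reverse=True)
    idx = int(math.floor(exp_cells * 0.01 - 1))
    threshold = counts[idx] * 1.0 / 10
    # keep barcodes from the top of the sorted list while they stay at or above threshold
    n = 0
    while n < len(counts) and counts[n] >= threshold:
        n += 1
    return n

# toy example: 5 real cells over a tail of ambient barcodes -> 5
print(estimate_cell_count([5000, 4800, 4500, 4200, 4000] + [100] * 95, exp_cells=500))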
Example 8
def core(left, right, args):

    ##
    ## create queue and child process for compressing the fastq files
    ##

    bc_len = args.barcode_length
    umi_len = args.umi_length

    tasks = JoinableQueue()
    p = Process(target=gz_worker, args=(tasks, ))
    p.daemon = True
    p.start()

    ##
    ## input files are paired from the sequencer so we just have to read through them
    ## and write them back out

    for i in range(len(left)):

        m1 = left[i]
        m2 = right[i]

        # pick apart the name of the second file to build the output name
        base = basename(m2)
        path = dirname(m2)

        base_parts = base.split(".")
        stub = base_parts[0]

        outfile = "{}_prepped.fastq".format(stub)

        if outfile == m1 or outfile == m2:
            ms.error_message(
                "Output file path matches input file path. WTF? {}".format(
                    outfile))
            sys.exit(1)

        if isfile(outfile):
            ms.warning_message(
                "output file exists. overwriting. {}".format(outfile))

        fout = open(outfile, "w")

        try:
            ms.message("Processing {}".format(m1))
            with open_reads(m1) as fin1, open_reads(m2) as fin2:

                nidx = 0
                nreads = 0
                lread = []
                for szl2 in fin2:

                    if (nreads % 1000000) == 0:
                        ms.progress_message("Parsed {} reads".format(nreads))

                    nidx += 1
                    if nidx == 1:
                        # read name line
                        rname = szl2.strip().split()
                        rname = rname[0]
                        # read two lines from the barcode file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline().strip()
                        # this is the barcode so we can pick it apart. I'm going to put the cell barcode
                        # at the front of the read so that I can maybe leverage samtools sort to sort
                        # barcodes together for me prior to parsing cells out
                        rname_tmp = re.sub("^\@", "", rname)
                        rname = "@{}:{}".format(szl1[0:bc_len], rname_tmp)
                        # 20180226
                        # moved the cell barcode to the front of the read name so we only
                        # need to write the umi at the end and not both
                        #rname += ":{}:{}".format(szl1[0:16], szl1[16:len(szl1)])
                        rname += ":{}".format(szl1[bc_len:(bc_len + umi_len)])
                        lread.append(rname + "\n")
                        # read the remaining lines for this read from the barcodes file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline()
                    elif nidx < 4:
                        lread.append(szl2)

                    if nidx == 4:
                        # finished with read
                        lread.append(szl2)
                        fout.write("".join(lread))
                        nidx = 0
                        lread = []
                        nreads += 1

            ms.progress_message("Parsed {} reads".format(nreads), last=True)

        except:
            fout.close()
            sys.exit(1)

        fout.close()

        #ms.message("compressing {}".format(outfile))
        #system("gzip -f {}".format(outfile))
        tasks.put(outfile)

    tasks.put(None)
    ms.message("Waiting for gzip compression to complete.")
    tasks.join()
    p.join()

    # done

    return 0
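
The read-name rewrite in the loop above can be shown in isolation: the cell barcode from the R1 (barcode) read is moved to the front of the R2 read name and the UMI is appended at the end. A small worked sketch (the read name, sequence and lengths are made up):

import re

def rewrite_name(rname, barcode_read, bc_len=16, umi_len=10):
    # rname: name line of the R2 record; barcode_read: sequence line of the matching R1 record
    rname = rname.strip().split()[0]
    rname_tmp = re.sub(r"^@", "", rname)
    new_name = "@{}:{}".format(barcode_read[0:bc_len], rname_tmp)
    new_name += ":{}".format(barcode_read[bc_len:(bc_len + umi_len)])
    return new_name

# prints "@AAACCCAAGAAACACT:READ1:GGTTAACCGG"
print(rewrite_name("@READ1 1:N:0:ACGT", "AAACCCAAGAAACACTGGTTAACCGGTAAA"))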
Example 9
#==============================================================================
# main
#==============================================================================

if __name__ == "__main__":

    argv = sys.argv
    argc = len(sys.argv)

    if argc < 3:
        message(usage())
        sys.exit(1)

    argv = argv[1:len(argv)]

    ms.message("Loading refflat into object")
    ta = TranscriptomeAnnotation.TranscriptomeAnnotation()
    t0 = time()
    ta.load_refflat(argv[0])
    ms.time_diff(t0)

    ms.message("building lookup table from object")
    t0 = time()
    lktable2 = ta.build_lktable()
    ms.time_diff(t0)

    outfile = "{}.t.sam".format(argv[1])

    out_rnames = ta.names
    out_lengths = [ta.d[k].length for k in out_rnames]
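
The snippet ends before anything is written to outfile; if the next step is to emit a transcriptome-space SAM header from out_rnames and out_lengths (an assumption, since that code is not shown here), it could look like:

# hypothetical continuation: one @SQ line per transcript name/length pair
with open(outfile, "w") as fout:
    fout.write("@HD\tVN:1.0\tSO:unsorted\n")
    for name, length in zip(out_rnames, out_lengths):
        fout.write("@SQ\tSN:{}\tLN:{}\n".format(name, length))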
Example 10
def core(flist):

    dtargets = {}
    ltargets = []
    target_idx = 0
    dsamples = {}
    lsamples = []
    sample_idx = 0
    lhits = []

    ms.message("Loading hits from {} files".format(len(flist)))
    t0 = time()
    for f in flist:
        sample_name = f.split(".")[0]
        lsamples.append(sample_name)
        # keep track of total reads
        dsamples[sample_name] = 0

        with open(f, "r") as fin:
            for szl in fin:
                szl = szl.strip()
                if szl[0] == "#":
                    r = re.search("^\#total\_reads\=([0-9]+)", szl)
                    if r:
                        dsamples[sample_name] = r.group(1)

                    continue

                aln = szl.split("\t")

                if aln[0] not in dtargets:
                    dtargets[aln[0]] = target_idx
                    ltargets.append(aln[0])
                    target_idx += 1

                if float(aln[1]) > 0:
                    # record the hit as target index, sample index, hit count
                    lhits.append([dtargets[aln[0]], sample_idx, aln[1]])

        # increment sample index
        sample_idx += 1

    ms.time_diff(t0)

    ##
    ## finished parsing files
    ##

    ms.message("Writing results to your base.")
    t0 = time()

    # create count output
    with open(COUNTS, "w") as fout:
        for l in lhits:
            sz = "\t".join(map(str, l))
            fout.write(sz + "\n")

    # create samples file
    with open(SAMPLES, "w") as fout:
        for sid in lsamples:
            l = [sid, dsamples[sid]]
            fout.write("\t".join(map(str, l)))
            fout.write("\n")

    # create genes file
    with open(GENES, "w") as fout:
        for tid in ltargets:
            fout.write(tid + "\n")

    ms.time_diff(t0)

    return 0
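
The three files written above describe a sparse count matrix in triplet form (target index, sample index, count). A sketch of loading them back as a scipy sparse matrix (the scipy usage is an assumption; COUNTS, SAMPLES and GENES are the path constants used in the example):

from scipy.sparse import coo_matrix

def load_counts(counts_file, genes_file, samples_file):
    # read the row (gene) and column (sample) label files written above
    with open(genes_file) as fin:
        genes = [szl.strip() for szl in fin]
    with open(samples_file) as fin:
        samples = [szl.strip().split("\t")[0] for szl in fin]
    # each line of the counts file is: target_index <tab> sample_index <tab> count
    rows, cols, vals = [], [], []
    with open(counts_file) as fin:
        for szl in fin:
            t_idx, s_idx, count = szl.strip().split("\t")
            rows.append(int(t_idx))
            cols.append(int(s_idx))
            vals.append(float(count))
    mat = coo_matrix((vals, (rows, cols)), shape=(len(genes), len(samples)))
    return mat, genes, samples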
Example 11
def core(args):

    # variables

    # barcode dict to count reads per barcode
    bc = defaultdict(int)
    # barcode dict to track distinct umi and count them
    bc_umi = {}
    bc_keep = set()
    num_bc = 0
    lnum = 0
    lbc = ""
    lbc_last = ""

    # for output conversion
    file_queue = JoinableQueue()
    p = None
    pool = []

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##

    # we have to index

    #
    # parse the alignments. in this loop we only extract the cell barcode and the umi
    # plus record the file position offsets for barcodes. the dict that is built
    # is indexed by the barcodes and each element contains a list of file offsets for
    # reads that came from that barcode. we also get all of the distinct umis collected
    # per barcode in this loop in order to estimate the actual cell count before
    # writing all of the read files out to disk
    ms.message('Counting per-barcode reads and UMI')
    t0 = time()
    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:

        for aln in fin:
            lnum += 1

            nparts = (aln.query_name).split(":")
            # barcode is first
            lbc = nparts[0]
            # umi is last
            umi = nparts[-1]

            bc[lbc] += 1

            if lbc not in bc_umi:
                bc_umi[lbc] = {}

            if umi not in bc_umi[lbc]:
                bc_umi[lbc][umi] = 0

            bc_umi[lbc][umi] += 1

            if (lnum % 1000000) == 0:
                ms.progress_message("parsed {} reads".format(lnum))

    # final progress message and total time of parsing
    ms.progress_message("parsed {} reads".format(lnum), last=True)
    sys.stderr.write("{} sec\n".format(time() - t0))

    t0 = time()

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI.  that's literally
    # how they do it.
    #

    t0 = time()
    ms.message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(map(str, [lbc, num_umi, bc[lbc]])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0
    num_bc = len(bc.keys())

    i = 0
    while True:
        # check if the current barcode is below threshold..
        if bc_umi_counts[i][1] < bc_umi_counts[exp_cells][1] * 1.0 / 10:
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        # keep track of the barcodes that we will retain
        num_reads += bc[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:

        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count
        if num_cells % 2 == 0:
            # even count
            median_idx = num_cells // 2
        else:
            median_idx = num_cells // 2 + 1

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(bc[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes:  {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate:     {}\n".format(num_cells))

    if args.estimate_only:
        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output:       {}\n".format(num_cells))

    t0 = time()

    # make set of the barcodes that we will keep
    bc_keep = set()
    for i in range(num_cells):
        bc_keep.add(bc_umi_counts[i][0])

    #
    # now we can dig back into the sorted bam file to export all of the individual cell lines

    # launch processes for gzip compression
    for i in range(args.p):
        p = Process(target=compress_reads, args=(file_queue, ))
        p.daemon = True
        p.start()
        pool.append(p)

    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:

        szout = ""
        bc_out = 0
        for aln in fin:
            lnum += 1

            nparts = (aln.query_name).split(":")
            # barcode is first
            lbc = nparts[0]
            # umi is last
            #umi = nparts[-1]

            if lbc != lbc_last:
                if lbc_last in bc_keep:

                    # write buffered data to file...
                    bc_out += 1
                    fname = "{}/{}.fastq".format(args.outpath, lbc_last)
                    ms.progress_message("writing {} ({} of {})".format(
                        fname, bc_out, num_cells))
                    with open(fname, "w") as fout:
                        fout.write(szout)

                    file_queue.put(fname)

                szout = ""

            # keep it?
            if random.random() < args.samplerate:
                # passes sampling limit
                if lbc in bc_keep:
                    # convert line to fasta and append it to the output string
                    szout += fastq_from_aln(aln)

            lbc_last = lbc

    if lbc_last in bc_keep:
        bc_out += 1
        # write buffered data to file...
        fname = "{}/{}.fastq".format(args.outpath, lbc_last)
        ms.progress_message("writing {} ({} of {})".format(
            fname, bc_out, num_cells),
                            last=True)
        with open(fname, "w") as fout:
            fout.write(szout)

        # put fname in the queue to be compressed
        file_queue.put(fname)

    for p in pool:
        file_queue.put(None)

    file_queue.join()

    for p in pool:
        p.join()

    ms.message("finished!")

    return 0
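
fastq_from_aln is not shown; a plausible sketch that turns a pysam alignment record back into a four-line FASTQ string (assuming phred+33 qualities and ignoring strand; not necessarily the original helper):

def fastq_from_aln(aln):
    # aln: pysam.AlignedSegment; rebuild a FASTQ record from name, sequence and qualities
    qual = "".join(chr(q + 33) for q in aln.query_qualities)
    return "@{}\n{}\n+\n{}\n".format(aln.query_name, aln.query_sequence, qual)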
Example 12
def core(args):

    annot = {}
    tid2gname = {}

    stub = hashlib.md5(args.fasta.encode()).hexdigest()

    fixed_case = "{}.ref.fa".format(stub)
    #gene_seq = "{}.gene.fa".format(stub)
    gene_shred = "{}.shred.fa".format(stub)
    gene_final = "{}.final.fa".format(stub)
    sub_chars = re.escape("[]{}\|/?!@#$%^&*()+=.") + "\s"

    fset = [fixed_case, "{}.fai".format(fixed_case), gene_shred, gene_final]

    ##
    # parse refflat
    annot, tid2gname = load_refflat(args.ref)

    ##
    # change case of input fasta
    ms.message("Converting all reference bases to uppercase")
    rres = fasta_fix_case(args.fasta, fixed_case)

    if args.just_alignment:
        args.bbmap = True

    if args.just_quantification:
        args.quantify = True

    if not args.just_alignment and not args.just_quantification:

        ##
        ## create worker process to deal with all of the shredding
        tasks = JoinableQueue()
        results = Queue()
        p = None
        pool = []

        for i in range(args.p):
            p = Process(target=worker, args=(
                tasks,
                results,
            ))
            p.daemon = True
            p.start()
            pool.append(p)

        ##
        # get to work!
        i = 0
        n = len(annot.keys())
        ms.message("Starting main loop")
        for gid in annot.keys():
            i += 1
            if (i % 3) == 0:
                ms.progress_message("processing {}. {} of {}".format(
                    gid, i, n))

            gidHat = re.sub("[{}]".format(sub_chars), "", gid)

            # export all sequences belonging to this gene
            gene_seq = "{}.{}.fa".format(stub, gidHat)

            rres = samtools_faidx(fixed_case, gene_seq, annot[gid])
            tasks.put([stub, gidHat])

        for p in pool:
            tasks.put(None)

        ms.progress_message("Waiting for shredding to complete", last=True)

        tasks.join()
        for p in pool:
            p.join()

        ms.message("done")

        results.put(None)

        while True:
            fname = results.get()
            if fname is None:
                break

            if not isfile(fname):
                continue

            ms.message("Joining {}".format(fname))

            rres = cat_result(fname, args.o)
            unlink(fname)

    # done!

    bam_out = re.sub("\.fasta$", ".bam", args.o)

    if args.bbmap:
        if not isfile(args.o):
            if not isfile("{}.gz".format(args.o)):
                ms.error_message(
                    "Shredded reads file doest not exist, gzipped or not ({})".
                    format(args.o))
                return 1
            else:
                # gunzip the reads file
                runcmd("gunzip {}.gz".format(args.o))

        ms.message("Aligning shreds back against the reference")

        rres = bbmap(args.o, fixed_case, bam_out, args.t)

    if args.quantify:
        if not isfile(bam_out):
            ms.error_message(
                "Expected alignment file does not exist ({})".format(bam_out))
            return 1

        ms.message("Parsing alignments")
        pares = parse_alignments(bam_out, tid2gname)
        rres = process_pares(pares)

        tsvout = re.sub("\.bam$", ".tsv", bam_out)

        # output:
        # gene_name, total_reads, min_mapp, max_mapp, mean_mapp, most_similar, all_genes, all_gene_counts
        ms.message("Writing mappability report")
        with open(tsvout, "w") as fout:

            fout.write("#read_length={}\n".format(args.l))
            fout.write("\t".join([
                "#gene_name", "total_reads", "min_mapp", "max_mapp",
                "mean_mapp", "most_similar", "all_targets", "target_counts"
            ]) + "\n")

            for gname in sorted(rres.keys()):

                total_reads = rres[gname]['total_reads']
                if total_reads == 0:
                    ms.warning_message("{} had zero reads".format(gname))
                min_mapp = 1
                max_mapp = 1
                mean_mapp = 1
                most_similar = "na"
                all_genes = "na"
                all_gene_counts = "na"

                if len(rres[gname]['target']) > 0:
                    mapp = []
                    for n in rres[gname]['target_count']:
                        if total_reads > 0:
                            mapp.append(1 - n * 1.0 / total_reads)
                        else:
                            mapp.append(0)

                    min_mapp = min(mapp)
                    max_mapp = max(mapp)
                    mean_mapp = np.mean(mapp)

                    for i in range(len(mapp)):
                        if mapp[i] == min_mapp:
                            most_similar = rres[gname]['target'][i]

                    all_genes = ",".join(rres[gname]['target'])
                    all_gene_counts = ",".join(
                        map(str, rres[gname]['target_count']))

                lout = [
                    gname, total_reads, min_mapp, max_mapp, mean_mapp,
                    most_similar, all_genes, all_gene_counts
                ]
                fout.write("\t".join(map(str, lout)) + "\n")

    if not args.just_quantification and not args.just_alignment:
        if args.z:
            if isfile("{}.gz".format(args.o)):
                unlink("{}.gz".format(args.o))

            cmd = "gzip {}".format(args.o)
            runcmd(cmd)

    #
    # clear out temp files
    for fname in fset:
        if isfile(fname):
            unlink(fname)

    return 0
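
The mappability values written to the report follow directly from the per-target read counts: for each off-target gene, mappability = 1 - (reads also hitting that gene) / (total shredded reads). A tiny worked sketch with made-up numbers:

def mappability(total_reads, target_counts):
    # 100 shreds from a gene, 20 of which also align to a paralog -> 1 - 20/100 = 0.8
    if total_reads == 0:
        return []
    return [1 - n * 1.0 / total_reads for n in target_counts]

print(mappability(100, [20, 5]))  # -> [0.8, 0.95]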
Example 13
def core(args):

    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
    linen = 0
    rnum = 0
    tmpname = hashlib.md5(args.fastq.encode()).hexdigest()
    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file
    if re.search("\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "r")
    else:
        fin = open(args.fastq, "r")

    # open SAM output
    fout = open("{}.sam".format(tmpname), "w")
    fout.write(samout)

    t0 = time()

    for szl in fin:
        linen += 1
        if linen == 1:
            # read name
            rname = szl.strip()
        elif linen == 2:
            # read
            seq = szl.strip()
        elif linen == 4:
            qual = szl.strip()
            # get the sam alignment read for writing
            sam = list(sam0)
            sam[SAM_QNAME] = re.sub("^\@", "", rname)
            sam[SAM_SEQ] = seq
            sam[SAM_QUAL] = qual
            # write line out to file
            fout.write("\t".join(sam))
            fout.write("\n")
            # reset line counter
            linen = 0
            rnum += 1

        if rnum > 0 and (rnum % 1000000) == 0:
            ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)

    ms.time_diff(t0)

    fout.close()
    fin.close()

    ms.message("Converting to BAM")
    t0 = time()

    t = args.t
    if t == 0:
        t = cpu_count() // 2
    elif t > cpu_count():
        t = cpu_count()

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1

    ms.time_diff(t0)

    unlink("{}.sam".format(tmpname))

    return 0
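
Each FASTQ record becomes one unaligned SAM line: FLAG 4 (unmapped), "*" for the reference and CIGAR fields, zeros for the position fields, and only QNAME, SEQ and QUAL filled in. A worked example of the line produced for a single read (assuming SAM_QNAME, SAM_SEQ and SAM_QUAL are column indices 0, 9 and 10):

sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
sam = list(sam0)
sam[0] = "READ1"   # SAM_QNAME: read name without the leading '@'
sam[9] = "ACGT"    # SAM_SEQ
sam[10] = "IIII"   # SAM_QUAL
print("\t".join(sam))
# READ1	4	*	0	0	*	*	0	0	ACGT	IIII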