Example #1
from os.path import isfile
from time import time

# 'ms' is a project-local messaging/timing helper (a minimal sketch follows
# this example); core() is also project-local (Example #5 shows a compatible
# definition)
def main(args):

    if not isfile(args.batch_file):
        ms.error_message("Input batch file does not exist ({})".format(
            args.batch_file))
        return 1

    flist = []
    err_flag = False

    ms.message("Checking batch file")
    t0 = time()

    # load the batch file and confirm all of the input files exist.
    with open(args.batch_file, "r") as fin:
        for szl in fin:
            fname = szl.strip()
            if not isfile(fname):
                err_flag = True
                ms.error_message("{} does not exist".format(fname))
            else:
                flist.append(fname)

    ms.time_diff(t0)

    if err_flag:
        return 1

    # propagate the exit status from core() rather than always returning 1
    rres = core(flist)

    return rres
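
All six examples call into a small project-local helper module, 'ms', for console messaging and timing. Its implementation is not part of this listing, so the following is only a minimal sketch, with names and signatures inferred from how the examples use it:

import sys
from time import time

def message(msg):
    sys.stderr.write("[note] {}\n".format(msg))

def error_message(msg):
    sys.stderr.write("[error] {}\n".format(msg))

def progress_message(msg, last=False):
    # carriage return keeps successive progress updates on one console line;
    # last=True terminates the line once the loop is finished
    sys.stderr.write("\r{}".format(msg))
    if last:
        sys.stderr.write("\n")

def time_diff(t0):
    # report wall-clock seconds elapsed since t0 (a time.time() value)
    sys.stderr.write("{:.1f} sec\n".format(time() - t0))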
Example #2
import os
import sys
from time import time

# 'ms' is the helper sketched after Example #1; 'load' is assumed to be a
# project-local GTF parser returning the structures unpacked below
def main(args):

    # check input file
    if not os.path.isfile(args.gtf):
        ms.error_message("Input file is missing")
        return 1

    t0 = time()
    ms.message("Loading annotation")
    dtid, dtid2gname, all_attrs, tid_order = load(args.gtf)
    ms.time_diff(t0)

    tmp = all_attrs.difference(set(["transcript_id", "gene_id", "gene_name"]))
    all_attrs = sorted(list(tmp))

    # header
    sys.stdout.write(
        "chrom\tdb\tfeature\tstart\tend\tscore\tstrand\tframe\ttranscript_id\tgene_id\tgene_name\t"
    )
    sys.stdout.write("\t".join(all_attrs))
    sys.stdout.write("\n")

    for tid in tid_order:
        for gg in dtid[tid]:
            # print each row
            lout = gg.parts[0:8]
            lout.append(gg.transcript_id())
            lout.append(gg.gene_id())
            lout.append(gg.gene_name())

            for aid in all_attrs:
                if aid in gg.attr:
                    lout.append(gg.attr[aid])
                else:
                    lout.append("na")

            sys.stdout.write("\t".join(lout) + "\n")

    return 0
Example #3
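# excerpt: this example begins inside a per-alignment parsing loop; 'aln',
# 'mstat', 'stat_list', 'argv' and 't0' are defined earlier in the original
# function, which was not captured here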
                        if aln.flag & 0x100:
                            mstat.second_mate_secondary += 1
                        else:
                            mstat.second_mate += 1

                        if aln.flag & 0x10:
                            mstat.second_mate_second_strand += 1
                        else:
                            mstat.second_mate_first_strand += 1

            stat_list.append(mstat)
            ms.progress_message("parsed {} lines".format(mstat.lines),
                                last=True)

        ms.time_diff(t0)

    print "\t".join([
        "file", "lines", "reads", "primary_aligned", "secondary_alignments",
        "first_strand", "second_strand", "first_mate", "second_mate",
        "first_mate_secondary", "second_mate_secondary", "fmfs", "fmss",
        "smfs", "smss"
    ])
    for i in range(len(stat_list)):
        m = stat_list[i]
        lout = [
            argv[i], m.lines, m.reads, m.primary_aligned,
            m.secondary_alignments, m.first_strand, m.second_strand,
            m.first_mate, m.second_mate, m.first_mate_secondary,
            m.second_mate_secondary, m.first_mate_first_strand,
            m.first_mate_second_strand, m.second_mate_first_strand,
            m.second_mate_second_strand
        ]
        # inferred completion of the truncated source: close the row list and
        # print it, matching the "smss" column in the header above
        print("\t".join(map(str, lout)))
Example #4
import math
import os
import pickle
import random
import re
import sys
from collections import defaultdict
from multiprocessing import JoinableQueue, Process
from time import time

# assumed project-local imports: the 'ms' helper sketched after Example #1
# (message/progress_message also appear unqualified below, suggesting they
# were imported directly as well), runcmd (returns a tuple whose first
# element is the exit status), parse_barcode, parse_umi,
# quantification_worker, compress_reads, and the pickle file-name constants
# BC_PICKLE, BC_UMI_PICKLE, BC_READCOUNT
def main(args):

    # variables
    bam_file = False

    bc = {}
    bc_readcount = defaultdict(int)
    bc_umi = {}
    num_bc = 0
    offset = 0
    lnum = 0
    sz_umi = ""
    umi_file = ""
    # string to capture file summary table that's used with kallisto pseudo -b
    sz_table = "#id\tumiFile\tcellFile\n"
    dumi = None
    bam_flag = False
    sam_header = ""
    quant_mode = False

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    file_queue = JoinableQueue()
    p = None
    pool = []

    if args.R is not None:
        if not os.path.isfile(args.R):
            ms.error_message(
                "Supplied annotation file does not exist ({})".format(args.R))
            return 1
        else:
            quant_mode = True

    ##
    ## figure out if we have a bam as input. if so we have to convert it to sam for indexing
    ##
    if re.search(r"\.bam$", args.fin):
        # send the sam file into the output folder
        sam_name = args.outpath + "/" + os.path.basename(
            re.sub(r"\.bam$", ".sam", args.fin))

        if not os.path.isfile(sam_name):
            # need to convert alignments to sam
            bam_flag = True
            cmd = "samtools view -h {} > {}".format(args.fin, sam_name)
            t0 = time()
            message("Temporarily converting BAM to SAM format")
            rres = runcmd(cmd)
            if rres[0] != 0:
                sys.stderr.write(
                    "Error: samtools exited with non-zero exit status!\n")
                return 1

            sys.stderr.write("{} sec\n".format(time() - t0))

    else:
        sam_name = args.fin

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##

    bc_pkl = args.outpath + "/" + BC_PICKLE
    bc_umi_pkl = args.outpath + "/" + BC_UMI_PICKLE
    bc_readcount_pkl = args.outpath + "/" + BC_READCOUNT
    sam_header_pkl = args.outpath + "/sam_header.pkl"

    if os.path.isfile(bc_pkl) and os.path.isfile(bc_umi_pkl) and \
            os.path.isfile(bc_readcount_pkl) and os.path.isfile(sam_header_pkl):
        ##
        # load indexes from pickles
        ms.message(
            "Loading existing barcode and umi indexes from output folder")
        t0 = time()
        bc = pickle.load(open(bc_pkl, "rb"))
        bc_umi = pickle.load(open(bc_umi_pkl, "rb"))
        bc_readcount = pickle.load(open(bc_readcount_pkl, "rb"))
        sam_header = pickle.load(open(sam_header_pkl, "rb"))
        num_bc = len(bc.keys())
        ms.time_diff(t0)

    else:
        # we have to index

        #
        # parse the alignments. in this loop we only extract the cell barcode and the umi
        # plus record the file position offsets for barcodes. the dict that is built
        # is indexed by the barcodes and each element contains a list of file offsets for
        # reads that came from that barcode. we also get all of the distinct umis collected
        # per barcode in this loop in order to estimate the actual cell count before
        # writing all of the read files out to disk
        message('Indexing cell barcodes from alignments and counting raw UMI.')
        t0 = time()
        with open(sam_name, "r") as fin:

            for szl in fin:
                if szl[0] == "@":
                    # append header line to header string
                    sam_header += szl
                    offset += len(szl)
                    continue

                # count lines and produce progress message so we know this thing is
                # running
                lnum += 1
                if lnum % 1000000 == 0:
                    progress_message("read {} lines".format(lnum))

                # fetch the cell barcode from the read name
                line_bc = parse_barcode(szl)

                if line_bc not in bc:
                    # first encounter with this barcode
                    num_bc += 1
                    # init a list for this barcode's line offsets within this sam file
                    bc[line_bc] = []
                    # init a dict for the barcode to track umis
                    bc_umi[line_bc] = defaultdict(int)

                # append line offset to this barcode's list
                bc[line_bc].append(offset)
                # get the umi and add it to this barcode's dict IF this is not a
                # secondary alignment
                aln = szl.split("\t")

                if (int(aln[1]) & 0x100) == 0:
                    # not a secondary alignment. track it.
                    umi = parse_umi(szl)
                    bc_umi[line_bc][umi] += 1

                if ((int(aln[1]) & 0x4) == 0) and ((int(aln[1]) & 0x100) == 0):
                    # this read is aligned and is a primary alignment so we can count this one
                    # into this barcode's aligned read count
                    bc_readcount[line_bc] += 1

                # update offset to the next line
                offset += len(szl)

        # final progress message and total time of parsing
        progress_message("read {} lines".format(lnum), last=True)
        sys.stderr.write("{} sec\n".format(time() - t0))

        t0 = time()

        if not args.no_pickles:
            ms.message("saving indexes to disk")
            pickle.dump(bc, open(bc_pkl, "wb"))
            pickle.dump(bc_umi, open(bc_umi_pkl, "wb"))
            pickle.dump(bc_readcount, open(bc_readcount_pkl, "wb"))
            pickle.dump(sam_header, open(sam_header_pkl, "wb"))
            ms.time_diff(t0)

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI.  that's literally
    # how they do it.
    #
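    #
    # worked example (illustrative numbers, not from any real run): with
    # --exp-cells 3000 the anchor index below is floor(3000 * 0.01 - 1) = 29;
    # if the barcode at sorted index 29 has 10000 distinct UMI, the threshold
    # is 10000 / 10 = 1000, and barcodes are accepted from the top of the
    # sorted list until the first one with fewer than 1000 UMI.
    #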

    t0 = time()
    message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\talignments\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(
                map(str, [lbc, num_umi, bc_readcount[lbc],
                          len(bc[lbc])])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    # index of the anchor barcode: 1% of the expected cell count (0-based)
    anchor_idx = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0

    i = 0
    while i < len(bc_umi_counts):
        # stop at the first barcode below one tenth of the anchor's UMI count
        if bc_umi_counts[i][1] < bc_umi_counts[anchor_idx][1] * 1.0 / 10:
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        num_reads += bc_readcount[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:

        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count (integer
        # division; for an even cell count this takes the upper of the two
        # middle barcodes)
        median_idx = num_cells // 2

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(
            bc_readcount[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes:  {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate:     {}\n".format(num_cells))

    if args.estimate_only:
        if bam_flag:
            # input was BAM so we can dump the converted file. just putting in
            # some logic to be certain that the original file is not deleted.
            if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                    sam_name != args.fin):
                os.unlink(sam_name)

        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output:       {}\n".format(num_cells))

    t0 = time()

    message(
        "Parsing individual detected cell alignments out to individual files")

    if quant_mode:
        # start quantification child processes for parsed sam files
        for i in range(args.p):
            p = Process(target=quantification_worker,
                        args=(
                            file_queue,
                            args,
                        ))
            p.daemon = True
            p.start()
            pool.append(p)

    else:
        # start child process for sam to bam conversion
        for i in range(args.p):
            p = Process(target=compress_reads, args=(file_queue, ))
            p.daemon = True
            p.start()
            pool.append(p)

    fin = open(sam_name, "r")

    # write individual cell files
    i = 0
    sz_umi = ""
    while i < num_cells:
        # get barcode
        lbc = bc_umi_counts[i][0]
        # start output strings
        szout = sam_header
        #sz_umi = ""
        # setup output file name
        cell_file = "{}/{}.sam".format(args.outpath, lbc)
        #umi_file = "{}.umi".format(lbc)

        # update user on progress
        progress_message("Writing {} - {}/{} ({} reads)".format(
            cell_file, i + 1, num_cells, len(bc[lbc])))

        if 0 < args.samplerate < 1:

            ##
            # to subsample we have to run through all read offsets for this cell and index the reads
            # then take a subset of them to write out to disk. I have to do this because the
            # alignment file contains secondary alignments which have to be collapsed by
            # read name prior to the subsampling.
            read_index = defaultdict(list)
            for offset in bc[lbc]:
                fin.seek(offset)
                aln = fin.readline().strip().split("\t")
                rname = aln[0]
                read_index[rname].append(offset)

            #
            # now by looping through distinct reads we can dump out only those that are at the specified rate
            for rname in read_index.keys():
                if random.random() > args.samplerate:
                    continue

                # dump this read
                for offset in read_index[rname]:
                    fin.seek(offset)
                    szout += fin.readline()

        else:

            # loop through line offsets for this barcode and append lines to the output string
            for offset in bc[lbc]:
                fin.seek(offset)
                szout += fin.readline()

        # write the file
        with open(cell_file, "w") as fout:
            fout.write(szout)

        # send the file off for bam compression
        file_queue.put(cell_file)

        i += 1

    fin.close()

    sys.stderr.write("\n")
    sys.stderr.write("{} sec\n".format(time() - t0))

    if bam_flag:
        # input was BAM so we can dump the converted file. just putting in
        # some logic to be certain that the original file is not deleted.
        if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                sam_name != args.fin):
            os.unlink(sam_name)

    sys.stderr.write("Waiting for child process to finish compressing files\n")

    for p in pool:
        file_queue.put(None)
    file_queue.join()

    for p in pool:
        p.join()

    message("finished!")

    return 0
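
The central trick in Example #4 is random access by byte offset: while streaming the SAM file once, it records the starting offset of every line per barcode, then later seek()s back to pull out just the lines belonging to each detected cell. A minimal self-contained sketch of that pattern (the file name, contents and key below are illustrative only):

from collections import defaultdict

# build an offset index over a tab-delimited file, keyed by the first column;
# binary mode makes len(line) a true byte count, so the running offset is
# always a valid seek() target (the original example does the same thing in
# text mode, which is safe for plain-ASCII SAM)
index = defaultdict(list)
offset = 0
with open("example.tsv", "rb") as fin:
    for line in fin:
        key = line.split(b"\t", 1)[0]
        index[key].append(offset)
        offset += len(line)

    # re-read just the lines for one key by seeking back to its offsets
    for off in index.get(b"some_key", []):
        fin.seek(off)
        print(fin.readline().decode().rstrip())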
Example #5
import re
from time import time

# 'ms' is the helper sketched after Example #1; COUNTS, SAMPLES and GENES
# are assumed project-local output file name constants
def core(flist):

    dtargets = {}
    ltargets = []
    target_idx = 0
    dsamples = {}
    lsamples = []
    sample_idx = 0
    lhits = []

    ms.message("Loading hits from {} files".format(len(flist)))
    t0 = time()
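    # each input file is assumed to look roughly like this (illustrative):
    #   #total_reads=123456
    #   targetA <tab> 10
    #   targetB <tab> 0
    # comment lines carry the total read count; data rows are tab-separated
    # target/count pairs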
    for f in flist:
        sample_name = f.split(".")[0]
        lsamples.append(sample_name)
        # keep track of total reads
        dsamples[sample_name] = 0

        with open(f, "r") as fin:
            for szl in fin:
                szl = szl.strip()
                if szl[0] == "#":
                    r = re.search("^\#total\_reads\=([0-9]+)", szl)
                    if r:
                        dsamples[sample_name] = r.group(1)

                    continue

                aln = szl.split("\t")

                if aln[0] not in dtargets:
                    dtargets[aln[0]] = target_idx
                    ltargets.append(aln[0])
                    target_idx += 1

                if float(aln[1]) > 0:
                    # record the hit as target index, sample index, hit count
                    lhits.append([dtargets[aln[0]], sample_idx, aln[1]])

        # increment sample index
        sample_idx += 1

    ms.time_diff(t0)

    ##
    ## finished parsing files
    ##

    ms.message("Writing results to your base.")
    t0 = time()

    # create count output
    with open(COUNTS, "w") as fout:
        for l in lhits:
            sz = "\t".join(map(str, l))
            fout.write(sz + "\n")

    # create samples file
    with open(SAMPLES, "w") as fout:
        for sid in lsamples:
            l = [sid, dsamples[sid]]
            fout.write("\t".join(map(str, l)))
            fout.write("\n")

    # create genes file
    with open(GENES, "w") as fout:
        for tid in ltargets:
            fout.write(tid + "\n")

    ms.time_diff(t0)

    return 0
Example #6
import gzip
import hashlib
import re
from multiprocessing import cpu_count
from os import unlink
from time import time

# assumed project-local pieces: the 'ms' helper sketched after Example #1,
# utils.runcmd (returns a tuple whose first element is the exit status), and
# SAM column-index constants SAM_QNAME, SAM_SEQ, SAM_QUAL (0, 9 and 10)
def core(args):

    # template for an unaligned SAM record: FLAG 4 (unmapped), no reference,
    # POS 0, MAPQ 0; QNAME, SEQ and QUAL get filled in per read below
    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
    linen = 0
    rnum = 0
    tmpname = hashlib.md5(args.fastq.encode("utf-8")).hexdigest()
    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file (text mode; the FASTQ may be gzip-compressed)
    if re.search(r"\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "rt")
    else:
        fin = open(args.fastq, "r")

    # open SAM output
    fout = open("{}.sam".format(tmpname), "w")
    fout.write(samout)

    t0 = time()

    for szl in fin:
        linen += 1
        if linen == 1:
            # read name
            rname = szl.strip()
        elif linen == 2:
            # read
            seq = szl.strip()
        elif linen == 4:
            qual = szl.strip()
            # get the sam alignment read for writing
            sam = list(sam0)
            sam[SAM_QNAME] = re.sub(r"^@", "", rname)
            sam[SAM_SEQ] = seq
            sam[SAM_QUAL] = qual
            # write line out to file
            fout.write("\t".join(sam))
            fout.write("\n")
            # reset line counter
            linen = 0
            rnum += 1

        # progress update once per million reads; checking linen == 0 limits
        # the message to the line where the read counter just advanced
        if linen == 0 and rnum > 0 and (rnum % 1000000) == 0:
            ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)

    ms.time_diff(t0)

    fout.close()
    fin.close()

    ms.message("Converting to BAM")
    t0 = time()

    t = args.t
    if t == 0:
        # default to half of the available CPUs
        t = cpu_count() // 2
    elif t > cpu_count():
        t = cpu_count()

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1

    ms.time_diff(t0)

    unlink("{}.sam".format(tmpname))

    return 0
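
core() above reads exactly three attributes from its argument. A minimal sketch of a matching argparse front end (a hypothetical CLI; only the attribute names fastq, bam and t are dictated by core(), which is taken from Example #6):

import argparse

def parse_args():
    ap = argparse.ArgumentParser(
        description="Convert a FASTQ file to an unaligned BAM via samtools")
    ap.add_argument("fastq", help="input FASTQ, optionally gzip-compressed")
    ap.add_argument("bam", help="output BAM file name")
    ap.add_argument("-t", type=int, default=0,
                    help="samtools threads (0 = use half the CPUs)")
    return ap.parse_args()

if __name__ == "__main__":
    core(parse_args())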