Example #1
def load(fname):

    dtid = {}
    dtid2gname = {}
    tid_order = []
    dattrs = set(["transcript_id", "gene_id", "gene_name"])
    gene_names = set()
    n = 0

    with open(fname, "r") as fin:

        for szl in fin:
            if szl[0] == "#":
                # comment line
                continue

            gg = gt.GtfRow(szl)

            if gg.feature() != "exon":
                continue

            n += 1
            if (n % 100000) == 0:
                ms.progress_message(
                    "parsed {} lines. {} transcripts in {} genes".format(
                        n, len(dtid.keys()), len(gene_names)))

            tid = gg.transcript_id()

            dattrs.update(gg.attr.keys())

            if tid not in dtid:
                dtid[tid] = []
                tid_order.append(tid)

            dtid[tid].append(gg)

            if tid not in dtid2gname:
                dtid2gname[tid] = gg.gene_name()
                gene_names.add(gg.gene_name())

    ms.progress_message("parsed {} lines. {} transcripts in {} genes".format(
        n, len(dtid.keys()), len(gene_names)),
                        last=True)

    return dtid, dtid2gname, dattrs, tid_order
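
A short usage sketch for the loader above. The file name and the print loop are illustrative assumptions; only the return signature comes from the function itself.

# hypothetical driver; "annotation.gtf" is a placeholder path
dtid, dtid2gname, dattrs, tid_order = load("annotation.gtf")
# transcripts come back in the order they first appear in the file
for tid in tid_order[:5]:
    print(tid, dtid2gname[tid], len(dtid[tid]))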
Example #2
        mstat = Stats()

        ms.message("parsing {}".format(argv[i]))
        t0 = time()
        with ps.AlignmentFile(argv[i]) as fin:
            rnames = ps_tools.get_alignmentfile_rnames(fin)
            numhit = 0

            for aln in fin:
                mstat.lines += 1

                if (aln.flag & 0x100) == 0:
                    mstat.reads += 1

                if mstat.lines % 1000000 == 0:
                    ms.progress_message("parsed {} lines".format(mstat.lines))

                if aln.flag & 0x4:
                    continue

                if aln.flag & 0x100:
                    mstat.secondary_alignments += 1
                else:
                    mstat.primary_aligned += 1

                if aln.flag & 0x10:
                    mstat.second_strand += 1
                else:
                    mstat.first_strand += 1

                if aln.flag & 0x1:
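
The bit tests in this example follow the standard SAM FLAG field. Below is a self-contained sketch that spells out the bits used above; the flag values come from the SAM specification, while the helper itself is illustrative.

FLAG_PAIRED = 0x1       # read is paired
FLAG_UNMAPPED = 0x4     # read is unmapped
FLAG_REVERSE = 0x10     # read aligned to the reverse strand
FLAG_SECONDARY = 0x100  # secondary alignment

def describe_flag(flag):
    # decode the subset of SAM flag bits checked in the example
    return {
        "paired": bool(flag & FLAG_PAIRED),
        "unmapped": bool(flag & FLAG_UNMAPPED),
        "reverse_strand": bool(flag & FLAG_REVERSE),
        "secondary": bool(flag & FLAG_SECONDARY),
    }

# describe_flag(0x110) -> secondary alignment on the reverse strand, mapped, not paired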
Example #3
def core(left, right, args):

    ##
    ## create queue and child process for compressing the fastq files
    ##

    bc_len = args.barcode_length
    umi_len = args.umi_length

    tasks = JoinableQueue()
    p = Process(target=gz_worker, args=(tasks, ))
    p.daemon = True
    p.start()

    ##
    ## input files are paired from the sequencer so we just have to read through them
    ## and write them back out

    for i in range(len(left)):

        m1 = left[i]
        m2 = right[i]

        # pick apart the name of the second file to build the output name
        base = basename(m2)
        path = dirname(m2)

        base_parts = base.split(".")
        stub = base_parts[0]

        outfile = "{}_prepped.fastq".format(stub)

        if outfile == m1 or outfile == m2:
            ms.error_message(
                "Output file path matches input file path. WTF? {}".format(
                    outfile))
            sys.exit(1)

        if isfile(outfile):
            ms.warning_message(
                "output file exists. overwriting. {}".format(outfile))

        fout = open(outfile, "w")

        try:
            ms.message("Processing {}".format(m1))
            with open_reads(m1) as fin1, open_reads(m2) as fin2:

                nidx = 0
                nreads = 0
                lread = []
                for szl2 in fin2:

                    if (nreads % 1000000) == 0:
                        ms.progress_message("Parsed {} reads".format(nreads))

                    nidx += 1
                    if nidx == 1:
                        # read name line
                        rname = szl2.strip().split()
                        rname = rname[0]
                        # read two lines from the barcode file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline().strip()
                        # this is the barcode so we can pick it apart. I'm going to put the cell barcode
                        # at the front of the read so that I can maybe leverage samtools sort to sort
                        # barcodes together for me prior to parsing cells out
                        rname_tmp = re.sub(r"^\@", "", rname)
                        rname = "@{}:{}".format(szl1[0:bc_len], rname_tmp)
                        # 20180226
                        # moved the cell barcode to the front of the read name so we only
                        # need to write the umi at the end and not both
                        #rname += ":{}:{}".format(szl1[0:16], szl1[16:len(szl1)])
                        rname += ":{}".format(szl1[bc_len:(bc_len + umi_len)])
                        lread.append(rname + "\n")
                        # read the remaining lines for this read from the barcodes file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline()
                    elif nidx < 4:
                        lread.append(szl2)

                    if nidx == 4:
                        # finished with read
                        lread.append(szl2)
                        fout.write("".join(lread))
                        nidx = 0
                        lread = []
                        nreads += 1

            ms.progress_message("Parsed {} reads".format(nreads), last=True)

        except Exception as e:
            # report the failure instead of dying silently
            ms.error_message("Failed while processing {}: {}".format(m1, e))
            fout.close()
            sys.exit(1)

        fout.close()

        #ms.message("compressing {}".format(outfile))
        #system("gzip -f {}".format(outfile))
        tasks.put(outfile)

    tasks.put(None)
    ms.message("Waiting for gzip compression to complete.")
    tasks.join()
    p.join()

    # done

    return 0
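
The heart of this example is how the read name is rewritten: the cell barcode taken from the barcode read (file 1) is moved to the front of the name and the UMI is appended at the end, exactly as the inline comments describe. A minimal sketch of just that transformation; the 16 bp barcode and 10 bp UMI lengths are illustrative defaults, not values from the original.

import re

def rewrite_name(rname, barcode_read, bc_len=16, umi_len=10):
    # strip the leading "@", prefix the cell barcode, append the UMI
    stripped = re.sub(r"^@", "", rname.strip().split()[0])
    barcode = barcode_read[0:bc_len]
    umi = barcode_read[bc_len:(bc_len + umi_len)]
    return "@{}:{}:{}".format(barcode, stripped, umi)

# rewrite_name("@READ1 1:N:0", "ACGTACGTACGTACGTAACCGGTTAA")
# -> "@ACGTACGTACGTACGT:READ1:AACCGGTTAA"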
Example #4
def core(args):

    # variables

    # barcode dict to count reads per barcode
    bc = defaultdict(int)
    # barcode dict to track distinct umi and count them
    bc_umi = {}
    bc_keep = set()
    num_bc = 0
    lnum = 0
    lbc = ""
    lbc_last = ""

    # for output conversion
    file_queue = JoinableQueue()
    p = None
    pool = []

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##

    # we have to index

    #
    # parse the alignments. in this loop we only extract the cell barcode and the umi
    # plus record the file position offsets for barcodes. the dict that is built
    # is indexed by the barcodes and each element contains a list of file offsets for
    # reads that came from that barcode. we also get all of the distinct umis collected
    # per barcode in this loop in order to estimate the actual cell count before
    # writing all of the read files out to disk
    ms.message('Counting per-barcode reads and UMI')
    t0 = time()
    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:

        for aln in fin:
            lnum += 1

            nparts = (aln.query_name).split(":")
            # barcode is first
            lbc = nparts[0]
            # umi is last
            umi = nparts[-1]

            bc[lbc] += 1

            if lbc not in bc_umi:
                bc_umi[lbc] = {}

            if umi not in bc_umi[lbc]:
                bc_umi[lbc][umi] = 0

            bc_umi[lbc][umi] += 1

            if (lnum % 1000000) == 0:
                ms.progress_message("parsed {} reads".format(lnum))

    # final progress message and total time of parsing
    ms.progress_message("parsed {} reads".format(lnum), last=True)
    sys.stderr.write("{} sec\n".format(time() - t0))

    t0 = time()

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI.  that's literally
    # how they do it.
    #

    t0 = time()
    ms.message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(map(str, [lbc, num_umi, bc[lbc]])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0
    num_bc = len(bc.keys())

    i = 0
    while i < num_bc:
        # stop when the current barcode falls below 10% of the UMI count at
        # the expected-cell index (the bound also guards against running off the list)
        if bc_umi_counts[i][1] < bc_umi_counts[exp_cells][1] * 1.0 / 10:
            break
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        # keep track of the barcodes that we will retain
        num_reads += bc[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:

        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count
        if num_cells % 2 == 0:
            # even count; integer division keeps the index an int
            median_idx = num_cells // 2
        else:
            median_idx = num_cells // 2 + 1

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(bc[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes:  {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate:     {}\n".format(num_cells))

    if args.estimate_only:
        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output:       {}\n".format(num_cells))

    t0 = time()

    # make set of the barcodes that we will keep
    bc_keep = set()
    for i in range(num_cells):
        bc_keep.add(bc_umi_counts[i][0])

    #
    # now we can dig back into the sorted bam file to export all of the individual cell lines

    # launch processes for gzip compression
    for i in range(args.p):
        p = Process(target=compress_reads, args=(file_queue, ))
        p.daemon = True
        p.start()
        pool.append(p)

    with ps.AlignmentFile(args.bam, "rb", check_header=False,
                          check_sq=False) as fin:

        szout = ""
        bc_out = 0
        for aln in fin:
            lnum += 1

            nparts = (aln.query_name).split(":")
            # barcode is first
            lbc = nparts[0]
            # umi is last
            #umi = nparts[-1]

            if lbc != lbc_last:
                if lbc_last in bc_keep:

                    # write buffered data to file...
                    bc_out += 1
                    fname = "{}/{}.fastq".format(args.outpath, lbc_last)
                    ms.progress_message("writing {} ({} of {})".format(
                        fname, bc_out, num_cells))
                    with open(fname, "w") as fout:
                        fout.write(szout)

                    file_queue.put(fname)

                szout = ""

            # keep it?
            if random.random() < args.samplerate:
                # passes sampling limit
                if lbc in bc_keep:
                    # convert line to fasta and append it to the output string
                    szout += fastq_from_aln(aln)

            lbc_last = lbc

    if lbc_last in bc_keep:
        bc_out += 1
        # write buffered data to file...
        fname = "{}/{}.fastq".format(args.outpath, lbc_last)
        ms.progress_message("writing {} ({} of {})".format(
            fname, bc_out, num_cells),
                            last=True)
        with open(fname, "w") as fout:
            fout.write(szout)

        # put fname in the queue to be compressed
        file_queue.put(fname)

    for p in pool:
        file_queue.put(None)

    file_queue.join()

    for p in pool:
        p.join()

    ms.message("finished!")

    return 0
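
The cell-number heuristic spelled out in the comments above (take the UMI count at 1% of the expected cell count, keep every barcode with at least 10% of that count) can be distilled into a few lines. This sketch covers only the thresholding step; it expects UMI counts already sorted in descending order, and the example numbers are made up.

import math

def estimate_cells(umi_counts_desc, exp_cells):
    # UMI count at 1% of the expected cell count (descending-sorted list)
    idx = int(math.floor(exp_cells * 0.01 - 1))
    threshold = umi_counts_desc[idx] * 1.0 / 10
    # accept barcodes from the top until one drops below the threshold
    n = 0
    while n < len(umi_counts_desc) and umi_counts_desc[n] >= threshold:
        n += 1
    return n

# estimate_cells([5000, 4800, 4500, 600, 20, 5], exp_cells=300) -> 4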
Example #5
def parse_alignments(fname, annot):

    # re.sub("\_[0-9]+\-[0-9]+$", "", sz)

    phash = {}

    qname = ""
    qname_last = ""
    rcount = 0

    with ps.AlignmentFile(fname) as fin:
        rnames = ps_tools.get_alignmentfile_rnames(fin)
        qbuff = set()

        for aln in fin:
            if aln.flag & 0x4:
                continue

            qname = aln.query_name
            if qname != qname_last:
                qbuff = list(qbuff)

                if len(qbuff) > 0:
                    # deal with it

                    rcount += 1
                    if (rcount % 1000000) == 0:
                        ms.progress_message("parsed {} reads".format(rcount))

                    for k in qbuff:
                        tmp = k.split("|")
                        g_source = tmp[0]
                        g_target = tmp[1]

                        if g_source not in phash:
                            phash[g_source] = {}

                        if g_target not in phash[g_source]:
                            phash[g_source][g_target] = 0

                        phash[g_source][g_target] += 1

                qbuff = set()

            # get the target
            read_source = re.sub(r"\_[0-9]+\-[0-9]+$", "", aln.query_name)
            target = rnames[aln.reference_id]

            g_source = annot[read_source]
            g_target = annot[target]

            qbuff.add("{}|{}".format(g_source, g_target))
            qname_last = qname

    # out of loop. handle the final read

    qbuff = list(qbuff)
    if len(qbuff) > 0:
        # deal with it

        for k in qbuff:
            tmp = k.split("|")
            g_source = tmp[0]
            g_target = tmp[1]

            if g_source not in phash:
                phash[g_source] = {}

            if g_target not in phash[g_source]:
                phash[g_source][g_target] = 0

            phash[g_source][g_target] += 1

    ms.progress_message("parsed {} reads [fin]".format(rcount), last=True)

    return phash
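
The returned phash is a nested dict: for each source gene it counts how many reads had at least one alignment to each target gene. A small, purely illustrative helper for flattening it into rows (the column order is an assumption):

def phash_to_rows(phash):
    # flatten {source: {target: count}} into (source, target, count) tuples
    rows = []
    for g_source in sorted(phash.keys()):
        for g_target, count in sorted(phash[g_source].items()):
            rows.append((g_source, g_target, count))
    return rows

# phash_to_rows({"GENE_A": {"GENE_A": 90, "GENE_B": 10}})
# -> [("GENE_A", "GENE_A", 90), ("GENE_A", "GENE_B", 10)]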
Example #6
def core(args):

    annot = {}
    tid2gname = {}

    # hash the input path to build unique temp-file names (md5 needs bytes)
    stub = hashlib.md5(args.fasta.encode()).hexdigest()

    fixed_case = "{}.ref.fa".format(stub)
    #gene_seq = "{}.gene.fa".format(stub)
    gene_shred = "{}.shred.fa".format(stub)
    gene_final = "{}.final.fa".format(stub)
    sub_chars = re.escape(r"[]{}\|/?!@#$%^&*()+=.") + r"\s"

    fset = [fixed_case, "{}.fai".format(fixed_case), gene_shred, gene_final]

    ##
    # parse refflat
    annot, tid2gname = load_refflat(args.ref)

    ##
    # change case of input fasta
    ms.message("Converting all reference bases to uppercase")
    rres = fasta_fix_case(args.fasta, fixed_case)

    if args.just_alignment:
        args.bbmap = True

    if args.just_quantification:
        args.quantify = True

    if not args.just_alignment and not args.just_quantification:

        ##
        # create worker processes to deal with all of the shredding
        tasks = JoinableQueue()
        results = Queue()
        p = None
        pool = []

        for i in range(args.p):
            p = Process(target=worker, args=(
                tasks,
                results,
            ))
            p.daemon = True
            p.start()
            pool.append(p)

        ##
        # get to work!
        i = 0
        n = len(annot.keys())
        ms.message("Starting main loop")
        for gid in annot.keys():
            i += 1
            if (i % 3) == 0:
                ms.progress_message("processing {}. {} of {}".format(
                    gid, i, n))

            gidHat = re.sub("[{}]".format(sub_chars), "", gid)

            # export all sequences belonging to this gene
            gene_seq = "{}.{}.fa".format(stub, gidHat)

            rres = samtools_faidx(fixed_case, gene_seq, annot[gid])
            tasks.put([stub, gidHat])

        for p in pool:
            tasks.put(None)

        ms.progress_message("Waiting for shredding to complete", last=True)

        tasks.join()
        for p in pool:
            p.join()

        ms.message("done")

        results.put(None)

        while True:
            fname = results.get()
            if fname is None:
                break

            if not isfile(fname):
                continue

            ms.message("Joining {}".format(fname))

            rres = cat_result(fname, args.o)
            unlink(fname)

    # done!

    bam_out = re.sub(r"\.fasta$", ".bam", args.o)

    if args.bbmap:
        if not isfile(args.o):
            if not isfile("{}.gz".format(args.o)):
                ms.error_message(
                    "Shredded reads file doest not exist, gzipped or not ({})".
                    format(args.o))
                return 1
            else:
                # gunzip the reads file
                runcmd("gunzip {}.gz".format(args.o))

        ms.message("Aligning shreds back against the reference")

        rres = bbmap(args.o, fixed_case, bam_out, args.t)

    if args.quantify:
        if not isfile(bam_out):
            ms.error_message(
                "Expected alignment file does not exist ({})".format(bam_out))
            return 1

        ms.message("Parsing alignments")
        pares = parse_alignments(bam_out, tid2gname)
        rres = process_pares(pares)

        tsvout = re.sub(r"\.bam$", ".tsv", bam_out)

        # output:
        # gene_name, total_reads, min_mapp, max_mapp, mean_mapp, most_similar, all_genes, all_gene_counts
        ms.message("Writing mappability report")
        with open(tsvout, "w") as fout:

            fout.write("#read_length={}\n".format(args.l))
            fout.write("\t".join([
                "#gene_name", "total_reads", "min_mapp", "max_mapp",
                "mean_mapp", "most_similar", "all_targets", "target_counts"
            ]) + "\n")

            for gname in sorted(rres.keys()):

                total_reads = rres[gname]['total_reads']
                if total_reads == 0:
                    ms.warning_message("{} had zero reads".format(gname))
                min_mapp = 1
                max_mapp = 1
                mean_mapp = 1
                most_similar = "na"
                all_genes = "na"
                all_gene_counts = "na"

                if len(rres[gname]['target']) > 0:
                    mapp = []
                    for n in rres[gname]['target_count']:
                        if total_reads > 0:
                            mapp.append(1 - n * 1.0 / total_reads)
                        else:
                            mapp.append(0)

                    min_mapp = min(mapp)
                    max_mapp = max(mapp)
                    mean_mapp = np.mean(mapp)

                    for i in range(len(mapp)):
                        if mapp[i] == min_mapp:
                            most_similar = rres[gname]['target'][i]

                    all_genes = ",".join(rres[gname]['target'])
                    all_gene_counts = ",".join(
                        map(str, rres[gname]['target_count']))

                lout = [
                    gname, total_reads, min_mapp, max_mapp, mean_mapp,
                    most_similar, all_genes, all_gene_counts
                ]
                fout.write("\t".join(map(str, lout)) + "\n")

    if not args.just_quantification and not args.just_alignment:
        if args.z:
            if isfile("{}.gz".format(args.o)):
                unlink("{}.gz".format(args.o))

            cmd = "gzip {}".format(args.o)
            runcmd(cmd)

    #
    # clear out temp files
    for fname in fset:
        if isfile(fname):
            unlink(fname)

    return 0
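
The per-target mappability written to the report is 1 minus the fraction of a gene's shredded reads that also aligned to that target, so the target with the lowest value is reported as the most similar gene. A tiny worked sketch with made-up counts:

# made-up counts: 1000 shredded reads from one gene, 150 of which also
# aligned to GENE_B and 20 to GENE_C
total_reads = 1000
target_counts = {"GENE_B": 150, "GENE_C": 20}

mapp = {t: 1 - n * 1.0 / total_reads for t, n in target_counts.items()}
# mapp == {"GENE_B": 0.85, "GENE_C": 0.98}
# min_mapp is 0.85, so GENE_B would be reported as the most similar target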
Example #7
def core(args):

    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
    linen = 0
    rnum = 0
    tmpname = hashlib.md5(args.fastq.encode()).hexdigest()
    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file (text mode so gzipped input yields str, not bytes)
    if re.search(r"\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "rt")
    else:
        fin = open(args.fastq, "r")

    # open SAM output
    fout = open("{}.sam".format(tmpname), "w")
    fout.write(samout)

    t0 = time()

    for szl in fin:
        linen += 1
        if linen == 1:
            # read name
            rname = szl.strip()
        elif linen == 2:
            # read
            seq = szl.strip()
        elif linen == 4:
            qual = szl.strip()
            # get the sam alignment read for writing
            sam = list(sam0)
            sam[SAM_QNAME] = re.sub(r"^\@", "", rname)
            sam[SAM_SEQ] = seq
            sam[SAM_QUAL] = qual
            # write line out to file
            fout.write("\t".join(sam))
            fout.write("\n")
            # reset line counter
            linen = 0
            rnum += 1

        if rnum > 0 and (rnum % 1000000) == 0:
            ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)

    ms.time_diff(t0)

    fout.close()
    fin.close()

    ms.message("Converting to BAM")
    t0 = time()

    t = args.t
    if t == 0:
        t = cpu_count() // 2
    elif t > cpu_count():
        t = cpu_count()

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1

    ms.time_diff(t0)

    unlink("{}.sam".format(tmpname))

    return 0
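
The SAM_QNAME, SAM_SEQ and SAM_QUAL constants are not shown in this snippet; based on the 11 mandatory SAM columns they would be field indices like the ones below (plausible definitions, not the originals):

# SAM mandatory columns: QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL
SAM_QNAME = 0   # read name
SAM_FLAG = 1    # sam0 sets this to "4", marking the record as unmapped
SAM_SEQ = 9     # read sequence
SAM_QUAL = 10   # base qualities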