def main():
    global sam_input_list, preprocess_list, output_list, leftout_list

    file = open(sam_input_file, "r")  # open sam file
    in_sam = Reader(file)  # convert to a Reader object

    for read in in_sam:
        preprocess_list.append([read.qname, read.seq])

    print(f"Input sam file contains {len(preprocess_list)} reads\n")

    print("Leftout reads(dont have either start_seg or end_seg):")
    for i in preprocess_list:
        if (start_seg not in i[1]) or (end_seg not in i[1]):
            print(i)
            leftout_list.append(f">{i[0]}\n")
            leftout_list.append(f"{i[1]}\n")
        else:
            # now define the range of the seg we need to extract
            output_list.append(f">{i[0]}\n")
            start_index = i[1].find(start_seg)
            end_index = i[1].find(end_seg) + len(end_seg)
            output_list.append(f"{i[1][start_index:end_index]}\n")

    print(f"total {len(leftout_list)/2} leftout reads")
    print(f"total {len(output_list)/2} output reads")

    write(output_list, fas_output_file)
    write(leftout_list, working_dir + "leftout_noSpike.fasta")
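
The write() helper and the globals (sam_input_file, start_seg, end_seg, working_dir) are defined elsewhere in that project; a minimal sketch consistent with how write() is called above:

def write(lines, path):
    # lines are already newline-terminated FASTA header/sequence strings
    with open(path, "w") as fh:
        fh.writelines(lines)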
Example No. 2
def change_chr(long_sam, dict_chr_split, wd, threads, verbose, type_reads):
    if "long" in type_reads:
        outfile = os.path.join(wd, 'long_reads_mapped')
    if "short" in type_reads:
        outfile = os.path.join(wd, 'short_reads_mapped')
    out_file = open(outfile, 'w')
    in_file = open(long_sam, "r")
    in_sam = Reader(in_file)
    header = in_sam.header
    sq = header['@SQ']
    dict_chr = {}
    for c in sq:
        single_elm = c.split(":")
        if single_elm[1] in dict_chr_split:
            single_elm[1] = dict_chr_split[single_elm[1]]
            change_chr = ":".join(single_elm)
            dict_chr[change_chr] = sq[c]
    header_new = OrderedDict({
        '@CO': header['@CO'],
        '@HD': header['@HD'],
        '@PG': header['@PG'],
        '@SQ': OrderedDict(dict_chr)
    })
    out_sam = Writer(out_file, header_new)
    for line in in_sam:
        if line.rname in dict_chr_split:
            name = dict_chr_split[line.rname]
            line.rname = name
            out_sam.write(line)
    out_sam.close()

    bam_final = sam_to_sorted_bam(outfile, threads, wd, verbose)

    return bam_final
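
A hypothetical invocation, mapping split scaffold names in the SAM back to the originals (all values invented for illustration):

bam = change_chr('long_reads.sam', {'scaffold_1': 'chr1'}, '/tmp/wd',
                 threads=4, verbose=True, type_reads='long')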
Example No. 3
def sam2table(filename, inputdir, outputdir):

    autosomeXY = list(range(1, 23))
    autosomeXY.append('X')
    autosomeXY.append('Y')
    autosomeXY = ['chr' + str(x) for x in autosomeXY]

    readsname = os.path.splitext(filename)[0]

    samfile = open(inputdir + filename, 'r')
    tablefile = open(outputdir + readsname + '.txt', 'w')
    errorfile = open(outputdir + readsname + '_error.sam', 'w')
    sam = Reader(samfile)
    error = Writer(errorfile)
    tablefile.write('\t'.join([
        'ID', 'FILENAME', 'READNAME', 'CHR', 'POS', 'INS_STRAND', 'RE', 'R1',
        'R2', 'TLEN', 'CIGAR_R1', 'CIGAR_R2', 'MDFLAG_R1', 'MDFLAG_R2',
        'BARCODE', 'BARCODE_Q'
    ]) + '\n')
    id_count = 0
    bar = tnrange(int(count_samlines(inputdir + filename) / 2), desc=readsname)
    for i in bar:
        r1 = next(sam)
        r2 = next(sam)
        if r1.rname == r2.rname and r1.rname in autosomeXY:
            if ((r1.flag in [99, 83]) and (r2.flag in [147, 163]) and
                (r1.qname.split('__abq:')[0] == r2.qname.split('__abq:')[0])):
                id_count += 1
                bc = r2.qname.split('__abq:')[2]
                bc_q = r2.qname.split('__abq:')[3]
                re_seq = r1.qname.split('__abq:')[1]
                if r1.reverse:
                    # it's strand of insertion (not read)
                    strand = '+'
                    pos = r2.pos + abs(r1.tlen) - 1
                else:
                    strand = '-'
                    pos = r1.pos
                mdflag_r1 = r1._tags[1]
                mdflag_r2 = r2._tags[1]  # MD tag of the mate, not r1 again
                tablefile.write('\t'.join([
                    str(id_count), readsname,
                    r1.qname.split('__abq:')[0], r1.rname,
                    str(pos), strand, re_seq, r1.seq, r2.seq,
                    str(abs(r1.tlen)), r1.cigar, r2.cigar, mdflag_r1,
                    mdflag_r2, bc, bc_q
                ]) + '\n')
            else:
                error.write(r1)
                error.write(r2)
        else:
            error.write(r1)
            error.write(r2)
    samfile.close()
    tablefile.close()
    errorfile.close()
    return 0
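
count_samlines() is not shown on this page; a minimal sketch matching its use above (the caller halves the count because records are consumed in pairs):

def count_samlines(path):
    # count alignment records, skipping '@' header lines
    with open(path) as fh:
        return sum(1 for line in fh if not line.startswith('@'))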
Example No. 4
def bamtofastq(bam, verbose):
    # note: despite the name, this writes FASTA (sequence only, no qualities)
    fasta = bam + ".fasta"
    in_file = open(bam, 'r')
    in_sam = Reader(in_file)
    with open(fasta, "w") as output_handle:
        for line in in_sam:
            if line.mapped:
                record = SeqRecord(Seq(str(line.seq)), name=str(line.qname))
                SeqIO.write(record, output_handle, "fasta")
    return fasta
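
A hypothetical call; the function returns the path it wrote, which is the input name plus ".fasta":

fasta_path = bamtofastq("aln.sam", verbose=False)  # -> "aln.sam.fasta", mapped reads only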
Example No. 5
def soft_clip(long_sam):

    in_file = open(long_sam, "r")
    in_sam = Reader(in_file)
    soft_clip_file = "test.fasta"
    with open(soft_clip_file, "w") as fh:
        for line in in_sam:
            if "S" in line.cigars[0][1]:
                if line.flag == 0 or line.flag == 16:
                    fh.write(line.rname + "\n")
                    fh.write(line.seq + "\n")

    return soft_clip_file
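
For reference, simplesam parses the CIGAR string into (length, operation) tuples, so line.cigars[0][1] above is the operation of the first CIGAR element (values hypothetical):

# line.cigar  -> '5S95M'
# line.cigars -> [(5, 'S'), (95, 'M')]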
Example No. 6
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 add_cb_ub_tags.py  BAM")
        sys.exit(1)
    in_file = open(sys.argv[1], 'r')
    sample = sys.argv[1].split(".bam", 1)[0]
    print("Sample: " + sample)
    out_file = sample + "_withtags.sam"
    in_sam = Reader(in_file)
    x = next(in_sam)
    print(x.tags)  # peek at the first record's tags before re-reading the file below
    barcode_tag = 'CB'
    umi_tag = 'UB'
    with Reader(open(sys.argv[1])) as in_bam:
        with Writer(open(out_file, 'w'), in_bam.header) as out_sam:
            for read in in_bam:
                #print(read.qname)
                #read[umi_tag] = read.qname.split(":")[2] # add the umi tag
                #read[barcode_tag] = read.qname.split(":")[1] # add the barcode tag
                read[umi_tag] = "dummy_umi"  # add the umi tag
                read[barcode_tag] = sample  # add the barcode tag
                out_sam.write(read)
Example No. 7
def change_chr_to_seq(short_reads, dict_ref_name, wd, threads, verbose):
    sys.stdout.write('###CHANGING CHROMOSOME NAMES IN BAM###\n')

    sam_link = os.path.join(wd, short_reads.split("/")[-1])
    if not os.path.exists(sam_link):
        os.link(short_reads, sam_link)
    outfile = sam_link + ".changed.sorted.sam"
    dict_invert_seq = {}
    for key in dict_ref_name:
        dict_invert_seq[dict_ref_name[key]] = key

    out_file = open(outfile, 'w')
    in_file = open(sam_link, "r")
    in_sam = Reader(in_file)
    header = in_sam.header
    sq = header['@SQ']
    dict_chr = {}
    for c in sq:
        single_elm = c.split(":")
        if single_elm[1] in dict_invert_seq:
            single_elm[1] = dict_invert_seq[single_elm[1]]
            change_chr = ":".join(single_elm)
            dict_chr[change_chr] = sq[c]
    header_new = OrderedDict({
        '@CO': header['@CO'],
        '@HD': header['@HD'],
        '@PG': header['@PG'],
        '@SQ': OrderedDict(dict_chr)
    })
    out_sam = Writer(out_file, header_new)
    for line in in_sam:
        if line.rname in dict_invert_seq:
            name = dict_invert_seq[line.rname]
            line.rname = name
            out_sam.write(line)
    out_sam.close()

    bam_final = sam_to_sorted_bam(outfile, threads, wd, verbose)
    sys.stdout.write('###DONE CHANGING CHROMOSOME NAMES IN BAM###\n')

    return bam_final


#if __name__ == '__main__':
#    dict_ref_name = {"seq1" : "scaffold_3"}
#    change_chr_to_seq(*sys.argv[1:], dict_ref_name)
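
Note the commented-out call would pass dict_ref_name as the last positional argument rather than the second; a corrected sketch (command-line handling hypothetical):

if __name__ == '__main__':
    dict_ref_name = {"seq1": "scaffold_3"}
    # usage: script.py <short_reads.bam> <wd> <threads>
    change_chr_to_seq(sys.argv[1], dict_ref_name, sys.argv[2], int(sys.argv[3]), True)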
Example No. 8
File: cli.py Project: yodeng/fastqp
def run(arguments):
    """ read FASTQ or SAM and tabulate basic metrics
    arguments is a dictionary so that we can call this as a function """
    arguments['input'] = argparse.FileType('r')(arguments['input'])
    arguments['text'] = argparse.FileType('w')(arguments['text'])
    args = Bunch(arguments)  # convert back to an argparse namespace
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)

    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq', '.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit(
            "Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")

    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq', '.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write(
                    "At {bytes:.0f} bytes per read of {len:.0f} length "
                    "we estimate {est:,} reads in input file.\n".format(
                        bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write(
                    "At {bytes:.0f} bytes per read of {len:.0f} length "
                    "we estimate {est:,} reads in input file.\n".format(
                        bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write(
                "{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write(
                    "Reading from gzipped file, bin size (-s) set to {binsize:n}.\n"
                    .format(binsize=n))
        else:
            sys.stderr.write(
                "Gzipped file detected. Reading file to determine bin size (-s).\n"
            )
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name),
                       stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write(
                    "{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write(
                "Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".
                format(binsize=n))
        est_nlines = None
    if est_nlines == 0:
        sys.exit(
            "The input file appears empty. Please check the file for data.")
    elif est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write(
                "Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {
        'C': defaultdict(lambda: defaultdict(int)),
        'G': defaultdict(lambda: defaultdict(int)),
        'A': defaultdict(lambda: defaultdict(int)),
        'T': defaultdict(lambda: defaultdict(int))
    }

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(
                mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit + i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit + i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass

        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write(
                    "Approximately {0:n}% complete at "
                    "read {1:,} in {2}\n".format(
                        percent_complete, act_nlines,
                        time.strftime('%H:%M:%S',
                                      time.gmtime(time.time() - time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]

    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    #nbasecalls = [ '\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    # pad missing bases in place; a bare map() is lazy in Python 3 and would be a no-op
    for v in cycle_nuc.values():
        padbases(bases)(v)

    quantile_values = [0.05, 0.25, 0.5, 0.75, 0.95]
    quantiles = []
    # replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()):  # copy the keys: py3 dict views cannot be mutated while iterating
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer])
                       for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(
            mul, (p**kmer.count(b)
                  for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]

    pos_gc = []
    for i in positions:
        try:
            pg = sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([
                cycle_nuc[i]['C'], cycle_nuc[i]['G'], cycle_nuc[i]['A'],
                cycle_nuc[i]['T']
            ]) * 100
        except ZeroDivisionError:
            pg = 0  # https://github.com/mdshw5/fastqp/issues/26
        pos_gc.append(pg)

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
        row=sample_name, column='reads', pos='None', value=act_nlines))

    for cycle, count in read_len.items():
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='read_len', pos=cycle, value=count))

    for i, position in enumerate(positions):
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q05', pos=position,
            value=quantiles[i][0]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q25', pos=position,
            value=quantiles[i][1]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q50', pos=position,
            value=quantiles[i][2]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q75', pos=position,
            value=quantiles[i][3]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q95', pos=position,
            value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
                row=sample_name,
                column=base,
                pos=position,
                value=cycle_nuc[position][base]))
    for position in positions:
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name,
            column='pos_gc',
            pos=position,
            value=pos_gc[position - 1]))
    for i in range(101):
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='read_gc', pos=i, value=cycle_gc[i]))

    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
            row=sample_name, column=kmer, pos='None', value=obs_exp))

    if args.count_duplicates:
        args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
            row=sample_name,
            column='duplicate',
            pos='None',
            value=duplicates / act_nlines))

    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize': (8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive,
                 [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive,
                       fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)
    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write(
            "There were {counts:,} reads in the file. Analysis finished in {sec}.\n"
            .format(counts=act_nlines,
                    sec=time.strftime('%H:%M:%S', time.gmtime(elapsed))))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write(
                    "KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n"
                    % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write(
                "QualityWarning: median base quality score is %s.\n" %
                median_qual)
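
Bunch, window and the other helpers (mean, gc, padbases, percentile) live in fastqp's utils module; minimal sketches consistent with their use above, not the project's exact code:

class Bunch(dict):
    # dict that also exposes keys as attributes, standing in for an argparse Namespace
    def __init__(self, d):
        super().__init__(d)
        self.__dict__ = self

def window(seq, n=5):
    # yield successive overlapping substrings (k-mers) of length n
    for i in range(len(seq) - n + 1):
        yield seq[i:i + n]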
Example No. 9
import sys
import simplesam
from simplesam import Reader, Writer

BAMFILE = sys.argv[1]
SAMFILE = sys.argv[2]

in_file = open(BAMFILE, 'r')
in_sam = Reader(in_file)
x = next(in_sam)
print(x.tags)  # a bare "x.tags" only displays in an interactive session

with Reader(open(BAMFILE)) as in_bam:
    with Writer(open(SAMFILE, 'w'), in_bam.header) as out_sam:
        for read in in_bam:
            read["UB"] = read.qname.split(":")[2]  # add the umi tag
            read["CB"] = read.qname.split(":")[1]  # add the barcode tag
            out_sam.write(read)
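
This assumes the aligner preserved a read-name layout with the barcode and UMI as colon-separated fields (layout hypothetical):

# <read_id>:<cell_barcode>:<umi>
# read.qname.split(":")[1] -> CB tag, split(":")[2] -> UB tag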
Example No. 10
File: cli.py Project: longrw/fastqp
def run(args):
    """ read FASTQ or SAM and tabulate basic metrics """
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)

    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq','.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")

    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq','.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext  == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)),
                      'G': defaultdict(lambda: defaultdict(int)),
                      'A': defaultdict(lambda: defaultdict(int)),
                      'T': defaultdict(lambda: defaultdict(int))}

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit+i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit+i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass


        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at "
                                 "read {1:,} in {2}\n".format(percent_complete,
                                                              act_nlines,
                                                              time.strftime('%H:%M:%S',
                                                                            time.gmtime(time.time()-time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]

    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    #nbasecalls = [ '\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    # pad missing bases in place; a bare map() is lazy in Python 3 and would be a no-op
    for v in cycle_nuc.values():
        padbases(bases)(v)

    quantile_values = [0.05,0.25,0.5,0.75,0.95]
    quantiles = []
    ## replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()): ## copy the keys: py3 dict views cannot be mutated while iterating
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]
    pos_gc = []
    for i in positions:
        try:
            pg = sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'],
                                                                    cycle_nuc[i]['G'],
                                                                    cycle_nuc[i]['A'],
                                                                    cycle_nuc[i]['T']]) * 100
        except ZeroDivisionError:  # a position with no called bases would otherwise crash
            pg = 0
        pos_gc.append(pg)

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines))

    for cycle, count in read_len.items():
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle,
                                                               value=count))

    for i, position in enumerate(positions):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q05', pos=position,
                                                               value=quantiles[i][0]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q25', pos=position,
                                                               value=quantiles[i][1]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q50', pos=position,
                                                               value=quantiles[i][2]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q75', pos=position,
                                                               value=quantiles[i][3]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q95', pos=position,
                                                               value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                                   column=base, pos=position,
                                                                   value=cycle_nuc[position][base]))
    for position in positions:
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='cycle_gc', pos=position,
                                                               value=cycle_gc[position]))
    for i in range(101):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='read_gc', pos=i,
                                                               value=cycle_gc[i]))

    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name,
                                                               column=kmer, pos='None',
                                                               value=obs_exp))

    if args.count_duplicates:
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates/act_nlines))


    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize':(8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)
    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines,
                                                                                                                       sec=time.strftime('%H:%M:%S',
                                                                                                                                         time.gmtime(elapsed))
        ))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n" % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)
Example No. 11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--trusted-cutoff', type=int, default=5)
    parser.add_argument("ht",
                        type=str,
                        help="Counting bloom filter for the reads")
    parser.add_argument("bam_file", type=str, help="bam read mapping file")
    parser.add_argument("--json", action='store_true', help="output JSON")

    args = parser.parse_args()

    ht = khmer.load_countgraph(args.ht)
    samfile = Reader(open(args.bam_file, 'r'))

    k = ht.ksize()
    seq_cnt = 0
    dropped_seqs = 0
    base_cnt = {}
    state_cnts = {}
    trans_cnts = {}

    total_bases = 0.0

    for rec in samfile:
        seq = rec.seq
        cigar = rec.cigar

        seq_cnt += 1
        if 'N' in seq:
            dropped_seqs += 1
            continue

        states = extract_cigar(rec.cigars)

        kmer = seq[:k]
        state = states[k] + trusted_str(ht.count(kmer), args.trusted_cutoff)

        state_cnts[state] = state_cnts.get(state, 0) + 1
        base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1

        for i in range(1, len(seq) - k - 1):
            total_bases += 1
            kmer = seq[i:i + k]
            cov = ht.get(kmer)

            last_state = state
            state = states[i] + trusted_str(cov, args.trusted_cutoff)

            trans = last_state + '-' + state
            trans_cnts[trans] = trans_cnts.get(trans, 0) + 1

            state_cnts[state] = state_cnts.get(state, 0) + 1
            base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1

    if not args.json:
        print("kmer size=", k)
        print("seq count=", seq_cnt, "dropped seqs=", dropped_seqs)
        print("base counts=", base_cnt)
        print("state counts=", state_cnts)
        print("trans counts=", trans_cnts)

    if not args.json:

        trans_probs = collections.defaultdict(float)  # float is the factory; float(0) is not callable

        for trans in sorted(trans_cnts.keys()):
            start_state = trans.split('-')[0]
            trans_probs[trans] = trans_cnts[trans] / float(
                state_cnts[start_state])
            print('{0}\t{1:0.7f}'.format(trans, trans_probs[trans]))

        print('static double trans_default[] = {{ log2({0:0.7f}), log2({1:0.7f}), '
              'log2({2:0.7f}), log2({3:0.7f}), log2({4:0.7f}), '
              'log2({5:0.7f}),'.format(trans_probs['M_t-M_t'],
                                       trans_probs['M_t-Ir_t'],
                                       trans_probs['M_t-Ig_t'],
                                       trans_probs['M_t-M_u'],
                                       trans_probs['M_t-Ir_u'],
                                       trans_probs['M_t-Ig_u']))
        print('log2({0:0.7f}), log2({1:0.7f}), log2({2:0.7f}), log2({3:0.7f}),'.format(
            trans_probs['Ir_t-M_t'], trans_probs['Ir_t-Ir_t'],
            trans_probs['Ir_t-M_u'], trans_probs['Ir_t-Ir_u']))
        print('log2({0:0.7f}), log2({1:0.7f}), log2({2:0.7f}), log2({3:0.7f}),'.format(
            trans_probs['Ig_t-M_t'], trans_probs['Ig_t-Ig_t'],
            trans_probs['Ig_t-M_u'], trans_probs['Ig_t-Ig_u']))
        print('log2({0:0.7f}), log2({1:0.7f}), log2({2:0.7f}), log2({3:0.7f}), '
              'log2({4:0.7f}), log2({5:0.7f}),'.format(
                  trans_probs['M_u-M_t'], trans_probs['M_u-Ir_t'],
                  trans_probs['M_u-Ig_t'], trans_probs['M_u-M_u'],
                  trans_probs['M_u-Ir_u'], trans_probs['M_u-Ig_u']))
        print('log2({0:0.7f}), log2({1:0.7f}), log2({2:0.7f}), log2({3:0.7f}),'.format(
            trans_probs['Ir_u-M_t'], trans_probs['Ir_u-Ir_t'],
            trans_probs['Ir_u-M_u'], trans_probs['Ir_u-Ir_u']))
        print('log2({0:0.7f}), log2({1:0.7f}), log2({2:0.7f}), log2({3:0.7f}),'.format(
            trans_probs['Ig_u-M_t'], trans_probs['Ig_u-Ig_t'],
            trans_probs['Ig_u-M_u'], trans_probs['Ig_u-Ig_u']))
        print('};')
    else:
        params = {
            'scoring_matrix': [
                -0.06642736173897607, -4.643856189774724, -7.965784284662087,
                -9.965784284662087
            ],
            'transition_probabilities': ((
                log(trans_cnts['M_t-M_t'] / float(state_cnts['M_t']), 2),
                log(trans_cnts['M_t-Ir_t'] / float(state_cnts['M_t']), 2),
                log(trans_cnts['M_t-Ig_t'] / float(state_cnts['M_t']), 2),
                log(trans_cnts['M_t-M_u'] / float(state_cnts['M_t']), 2),
                log(trans_cnts['M_t-Ir_u'] / float(state_cnts['M_t']), 2),
                log(trans_cnts['M_t-Ig_u'] / float(state_cnts['M_t']), 2),
            ), (
                log(trans_cnts['Ir_t-M_t'] / float(state_cnts['Ir_t']), 2),
                log(trans_cnts['Ir_t-Ir_t'] / float(state_cnts['Ir_t']), 2),
                log(trans_cnts['Ir_t-M_u'] / float(state_cnts['Ir_t']), 2),
                log(trans_cnts['Ir_t-Ir_u'] / float(state_cnts['Ir_t']), 2),
            ), (
                log(trans_cnts['Ig_t-M_t'] / float(state_cnts['Ig_t']), 2),
                log(trans_cnts['Ig_t-Ig_t'] / float(state_cnts['Ig_t']), 2),
                log(trans_cnts['Ig_t-M_u'] / float(state_cnts['Ig_t']), 2),
                log(trans_cnts['Ig_t-Ig_u'] / float(state_cnts['Ig_t']), 2),
            ), (
                log(trans_cnts['M_u-M_t'] / float(state_cnts['M_u']), 2),
                log(trans_cnts['M_u-Ir_t'] / float(state_cnts['M_u']), 2),
                log(trans_cnts['M_u-Ig_t'] / float(state_cnts['M_u']), 2),
                log(trans_cnts['M_u-M_u'] / float(state_cnts['M_u']), 2),
                log(trans_cnts['M_u-Ir_u'] / float(state_cnts['M_u']), 2),
                log(trans_cnts['M_u-Ig_u'] / float(state_cnts['M_u']), 2),
            ), (
                log(trans_cnts['Ir_u-M_t'] / float(state_cnts['Ir_u']), 2),
                log(trans_cnts['Ir_u-Ir_t'] / float(state_cnts['Ir_u']), 2),
                log(trans_cnts['Ir_u-M_u'] / float(state_cnts['Ir_u']), 2),
                log(trans_cnts['Ir_u-Ir_u'] / float(state_cnts['Ir_u']), 2),
            ), (
                log(trans_cnts['Ig_u-M_t'] / float(state_cnts['Ig_u']), 2),
                log(trans_cnts['Ig_u-Ig_t'] / float(state_cnts['Ig_u']), 2),
                log(trans_cnts['Ig_u-M_u'] / float(state_cnts['Ig_u']), 2),
                log(trans_cnts['Ig_u-Ig_u'] / float(state_cnts['Ig_u']), 2),
            ))
        }
        print(
            json.dumps(params,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
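
trusted_str and extract_cigar are helpers from the same khmer sandbox script; a sketch of trusted_str consistent with the state names used above ('M_t', 'M_u', ...):

def trusted_str(cov, trusted_cutoff):
    # suffix marking whether the k-mer's coverage clears the trusted cutoff
    return '_t' if cov >= trusted_cutoff else '_u'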
Example No. 12
def main(ext_args=None):
    from transcoorder import __version__
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('gtf',
                        type=str,
                        help='GTF file containing transcripts')
    parser.add_argument('bam',
                        type=argparse.FileType('r'),
                        help="SAM or BAM files aligned to transcriptome")
    parser.add_argument('fasta',
                        type=Fasta,
                        help="FASTA format assembly coresponding to GTF")
    parser.add_argument('-o',
                        '--out',
                        type=argparse.FileType('w'),
                        default='-',
                        help="output file for genomic SAM (default: stdout)")
    parser.add_argument(
        '-t',
        '--tag-name',
        type=str,
        default='ZT',
        help=
        "SAM tag name for storing transcript identifier. default: %(default)s")
    parser.add_argument('--debug',
                        action="store_true",
                        help="enable debugging")
    parser.add_argument('--version',
                        action="version",
                        version=__version__,
                        help="display version number")
    # print help usage if no arguments are supplied
    if len(sys.argv) == 1 and not ext_args:
        parser.print_help()
        sys.exit(1)
    elif ext_args:
        args = parser.parse_args(ext_args)
    else:
        args = parser.parse_args()

    try:
        db = FeatureDB(args.gtf + '.db')
    except ValueError:
        sys.stderr.write("building sqlite database for %s..." % args.gtf)
        db = create_db(args.gtf,
                       args.gtf + '.db',
                       disable_infer_transcripts=True,
                       disable_infer_genes=True)

    header = build_sam_header_from_fasta(args.fasta)
    with Reader(args.bam) as bamfile, Writer(args.out, header) as outfile:
        try:
            read_count = len(bamfile)
        except NotImplementedError:
            read_count = None
        with tqdm(total=read_count, unit='read') as pbar:
            for read in bamfile:
                features = cache_gtf_features(db, read.rname)
                if features is None:
                    if args.debug:
                        sys.stderr.write("%s not found in %s\n" %
                                         (read.rname, args.gtf))
                else:
                    transcript, genome_offset, transcript_coords = features
                    read = transcript_sam_to_genomic_sam(
                        read, transcript, genome_offset, transcript_coords)
                    if read is not None:
                        outfile.write(read)
                pbar.update(1)
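
Called as a library function, the same entry point accepts its arguments as a list (file names hypothetical):

main(ext_args=['genes.gtf', 'aln.bam', 'assembly.fa', '-o', 'genomic.sam'])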
Example No. 13
from simplesam import Reader, Writer
import inspect
import sys, os, fileinput, string

in_file = open(sys.argv[1], 'r')
in_sam = Reader(in_file)
out_file = open('full_ecoli_mapped_q10_truth.txt', 'w')
# out_sam = Writer(out_file)

x = next(in_sam)

try:
    while x.qname != '':
        #if x.reverse:
        #    out_file.write("+" + " ")
        #else:
        #    out_file.write("-" + " ")
        out_file.write(x.rname + " ")
        out_file.write(x.qname + " ")
        out_file.write(str(x.pos) + " ")
        out_file.write(str(x.pos + len(x.seq)) + "\n")
        #print(str(type(x)))
        x = next(in_sam)
except StopIteration:  # end of input; a bare except would hide real errors
    print("Long read alignment ground truth generated")

in_file.close()
out_file.close()
Example No. 14
def analyzeReads(myIntermedFileName, myRefSeq, myRefCGindices):
    locusUMIreadsDict = {}  # locus : UMI : allReadsforthatUMI

    readDict = {}  # fragName:totalReads
    # initialize readDict
    for frag in myRefSeq:
        readDict[frag] = 0

    infile = open(myIntermedFileName, 'r')
    samFile = Reader(infile)

    for forwardRead in samFile:
        if forwardRead.mapped and forwardRead.paired:  # makes sure reads map to a reference seq and are paired
            reverseRead = next(samFile)  # next() builtin; .next() is Python 2 only

            #### UMI...
            if forwardRead.rname == reverseRead.rname:  ### this is ADDED 11/20/19 to make sure both the forward and the reverse read map to the same fragment

                # make sure read name, UMI are same
                if forwardRead.qname != reverseRead.qname:
                    print("READS NOT PAIRED, out of sync")
                    print("forwardRead: ", forwardRead.qname, "reverseRead: ",
                          reverseRead.qname)
                    print("EXITING...")
                    sys.exit()

                fUMI = forwardRead.qname.split("+")[1]
                rUMI = reverseRead.qname.split("+")[1]

                # if "N" not in fUMI and "N" not in rUMI: ### ADDED 11/21/19 - PREVENTS US FROM ANALYZING READS THAT HAVE "N"s IN THE UMIs

                forwardMethylation = Read(forwardRead, myRefSeq,
                                          myRefCGindices)

                reverseMethylation = Read(reverseRead, myRefSeq,
                                          myRefCGindices)

                myLocus = forwardMethylation.locus

                #ADDED - keeping track of total number of reads - PUT THIS AT THE END
                readDict[myLocus] += 1

                locusCGindices = myRefCGindices[myLocus]

                myUMI = forwardMethylation.umi

                consensusIndexString = ""
                consensusMethString = ""
                consensusMethIndices = []
                consensusUnmethIndices = []

                for index in locusCGindices:
                    if index >= forwardMethylation.startCoord and index < reverseMethylation.startCoord and index in forwardMethylation.methIndices:
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    elif index > forwardMethylation.endCoord and index <= reverseMethylation.endCoord and index in reverseMethylation.methIndices:
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    elif (index >= forwardMethylation.startCoord and index >= reverseMethylation.startCoord
                          and index <= forwardMethylation.endCoord and index <= reverseMethylation.endCoord
                          and index in forwardMethylation.methIndices
                          and index in reverseMethylation.methIndices):
                        # "index >= forwardMethylation.startCoord" was in here twice;
                        # the second occurrence was changed to reverseMethylation.startCoord
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    else:
                        consensusIndexString += (str(index) + "z")
                        consensusMethString += "z"
                        consensusUnmethIndices.append(index)

                if myLocus in locusUMIreadsDict:
                    if myUMI in locusUMIreadsDict[myLocus]:
                        locusUMIreadsDict[myLocus][myUMI].append(
                            consensusMethString)
                    else:
                        locusUMIreadsDict[myLocus][myUMI] = [
                            consensusMethString
                        ]
                else:
                    readList = [consensusMethString]
                    myUMIdict = {myUMI: readList}
                    locusUMIreadsDict[myLocus] = myUMIdict

    myResults = ReadsChunkResults(myIntermedFileName, readDict,
                                  locusUMIreadsDict)
    return myResults
Example No. 15
                        file=tsv)
        tsv.flush()
    # Check for missing genes
    missed = gene_ids.difference(processed)
    if len(missed) > 0:
        print(
            f"No FASTA sources for the following IDs: {', '.join(missed)}\n" +
            'These genes are absent from genes FASTA, but may be used for ' +
            'read mapping if data for them is available.',
            file=stderr)

# TODO: load SAM data for Illumina reads

# Loading PacBio reads
print('Loading PacBio hits...', file=stderr)
pb_reader = Reader(open(args.p))
mapped_hits = {x: [] for x in regions_of_interest}
read_counts = {x: 0 for x in gene_ids}
mapped_hit_names = {x: [] for x in gene_ids}
for hit in pb_reader:
    if hit.rname in features_by_source:
        hit_coord = (hit.pos, hit.pos + len(hit))
        for gene in features_by_source[hit.rname]:
            if overlap((gene.start, gene.end), hit_coord):
                mapped_hits[hit.rname].append(hit_coord)
                read_counts[gene.get_id_prefix()] += 1
                mapped_hit_names[gene.get_id_prefix()].append(hit.qname)
if args.hit_ids:
    for gene in mapped_hit_names:
        if len(mapped_hit_names[gene]) > 0:
            for read_id in mapped_hit_names[gene]: