def main():
    parser = argparse.ArgumentParser(
        description="Convert aligned BAM/SAM entries into genePred (gpd) lines.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="BAM file, or use - to read SAM from STDIN")
    parser.add_argument('--minimum_intron',
                        type=int,
                        default=68,
                        help="smallest alignment gap to call as an intron")
    parser.add_argument('-o', '--output',
                        help="Output file; use a .gz extension for gzip output")
    args = parser.parse_args()

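    # Write to STDOUT by default, or to a plain/gzip file chosen by the output extension.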
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

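    # Stream SAM from STDIN or iterate over the BAM on disk.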
    if args.input == '-':
        sh = SamStream(sys.stdin)
    else:
        sh = BAMFile(args.input)
    for e in sh:
        if not e.is_aligned(): continue
        gpd_line = e.get_target_transcript(
            min_intron=args.minimum_intron).get_gpd_line()
        of.write(gpd_line + "\n")
    sh.close()
    of.close()
def main():
    parser = argparse.ArgumentParser(
        description=
        "Split an ONT BAM into per-cell sorted BAMs, based on Yunhao's naming convention, e.g. BOWDEN04_20160603_FNFAD11879_MN16254_sequencing_run_R9_H1cDNA_SIRV_79593_ch49_read2100_strand_pass_2D, i.e. /^(\S+)_\d+_[^_]+_[^_]+_[^_]+_[^_]+_[^_]+$/",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Input bam file")
    parser.add_argument('-o',
                        '--output',
                        required=True,
                        help="output directory")
    parser.add_argument(
        '--suffix',
        help="string to add before .bam: cellname.XXXXXX.sorted.bam")
    args = parser.parse_args()
    args.output = args.output.rstrip('/')
    nameprog = re.compile(r'^(\S+)_\d+_[^_]+_[^_]+_[^_]+_[^_]+_[^_]+$')
    if not os.path.exists(args.output): os.makedirs(args.output)
    bf = BAMFile(args.input)
    sorted_header_text = sort_header(bf.header_text)
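    # One open gzip SAM stream per cell, keyed by the cell name parsed from the read name.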
    fhs = {}
    z = 0
    for e in bf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(
                str(z) + " reads  " + str(len(fhs.keys())) + " cells      \r")
        m = nameprog.match(e.value('qname'))
        mol = '_nonont'
        if m:
            mol = m.group(1)
        ln = e.get_line()
        if mol not in fhs:
            fname = args.output + '/' + mol
            if args.suffix:
                fname += '.' + args.suffix
            fname += '.gz'
            of = gzip.open(fname, 'w')
            fhs[mol] = [of, fname]
            fhs[mol][0].write(sorted_header_text)
        fhs[mol][0].write(ln + "\n")
    sys.stderr.write("\n")
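    # Second pass: pipe each per-cell SAM.gz through samtools view/sort to produce a sorted BAM.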
    z = 0
    for mol in fhs:
        z += 1
        fhs[mol][0].close()
        ofname = fhs[mol][1][:-2] + 'sorted'
        inf = gzip.open(fhs[mol][1])
        cmd1 = 'samtools view -Sb -'
        cmd2 = 'samtools sort - ' + ofname
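        # legacy samtools sort syntax (pre-1.3): the final argument is an output prefix and '.bam' is appended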
        p2 = Popen(cmd2.split(), stdin=PIPE)
        p1 = Popen(cmd1.split(), stdin=inf, stdout=p2.stdin)
        p2.communicate()
        p1.communicate()
        inf.close()
        os.remove(fhs[mol][1])
        sys.stderr.write(
            str(z) + '/' + str(len(fhs.keys())) + " finished        \r")
    sys.stderr.write("\n")
def main(args):

    bf = BAMFile(args.input)
    chrlens = bf.get_header().get_sequence_lengths()
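    # Write one "<sequence name>\t<length>" line per reference sequence in the header.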
    of_chrlens = open(args.output, 'w')
    for name in sorted(chrlens.keys()):
        of_chrlens.write(name + "\t" + str(chrlens[name]) + "\n")
    of_chrlens.close()
def main():
    parser = argparse.ArgumentParser(
        description="Split a BAM into per-SMRT-cell sorted BAMs based on PacBio read names.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Input bam file")
    parser.add_argument('-o',
                        '--output',
                        required=True,
                        help="output directory")
    parser.add_argument(
        '--suffix',
        help="string to add before .bam: smrtcellname.XXXXXX.sorted.bam")
    args = parser.parse_args()
    args.output = args.output.rstrip('/')
    nameprog = re.compile(r'^([^/]+)/\d+/')
    if not os.path.exists(args.output): os.makedirs(args.output)
    bf = BAMFile(args.input)
    sorted_header_text = sort_header(bf.header_text)
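    # One open gzip SAM stream per SMRT cell, keyed by the movie name parsed from the read name.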
    fhs = {}
    z = 0
    for e in bf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(
                str(z) + " reads  " + str(len(fhs.keys())) + " cells      \r")
        m = nameprog.match(e.value('qname'))
        mol = '_nonpacbio'
        if m:
            mol = m.group(1)
        ln = e.get_line()
        if mol not in fhs:
            fname = args.output + '/' + mol
            if args.suffix:
                fname += '.' + args.suffix
            fname += '.gz'
            of = gzip.open(fname, 'w')
            fhs[mol] = [of, fname]
            fhs[mol][0].write(sorted_header_text)
        fhs[mol][0].write(ln + "\n")
    sys.stderr.write("\n")
    z = 0
    for mol in fhs:
        z += 1
        fhs[mol][0].close()
        ofname = fhs[mol][1][:-2] + 'sorted'
        inf = gzip.open(fhs[mol][1])
        cmd1 = 'samtools view -Sb -'
        cmd2 = 'samtools sort - ' + ofname
        p2 = Popen(cmd2.split(), stdin=PIPE)
        p1 = Popen(cmd1.split(), stdin=inf, stdout=p2.stdin)
        p2.communicate()
        p1.communicate()
        inf.close()
        os.remove(fhs[mol][1])
        sys.stderr.write(
            str(z) + '/' + str(len(fhs.keys())) + " finished        \r")
    sys.stderr.write("\n")
def main():
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="BAM file input")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="Thread count")
    parser.add_argument('--tempdir',
                        help="location of temporary directory to use")
    parser.add_argument('-o', '--output', help="Output file name")
    args = parser.parse_args()
    if args.tempdir: args.tempdir = args.tempdir.rstrip('/')
    bf = BAMFile(args.input)
    seqs = bf.get_header().get_sequence_lengths()
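    # Dump reference sequence names and lengths to a temp file for the per-sequence workers (do_seq).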
    f = tempfile.NamedTemporaryFile(delete=False)
    for seq in seqs:
        f.write(seq + "\t" + str(seqs[seq]) + "\n")
    f.close()
    bf.close()
    fout = tempfile.NamedTemporaryFile(delete=False)
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    if args.tempdir: cmd += ' -T ' + args.tempdir
    global ps
    ps = Popen(cmd.split(), stdin=PIPE, stdout=fout)
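    # Global sort process; worker results are expected to be fed to it by the do_output callback (not shown here).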

    if args.threads > 1:
        pool = Pool(processes=args.threads)
    for seq in seqs:
        if args.threads > 1:
            pool.apply_async(do_seq,
                             args=(seq, args, f.name),
                             callback=do_output)
        else:
            res = do_seq(seq, args, f.name)
            do_output(res)
    if args.threads > 1:
        pool.close()
        pool.join()
    ps.communicate()
    fout.close()
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
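    # Re-sort the combined temporary output and stream it to the chosen destination.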
    cmd = 'sort -k 1,1 -k2,2n -k3,3n -S4G --parallel=' + str(args.threads)
    inf = open(fout.name)
    p = Popen(cmd.split(), stdout=PIPE, stdin=inf)
    for line in p.stdout:
        of.write(line)
    p.communicate()
    inf.close()
    of.close()
    os.unlink(f.name)
    os.unlink(fout.name)
def main():
  #do our inputs
  args = do_inputs()
  bf = BAMFile(args.input)
  if not args.all_alignments:
    if args.index_path:
      bf.read_index(args.index_path)
    else:
      bf.read_index()
  ls = LocusStream(bf)
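  # Group alignments into loci; the exon ranges from each locus are handed to get_output as one batch.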
  if args.output:
    args.output = open(args.output,'w')
  else:
    args.output = sys.stdout
  global of
  of = args.output
  z = 0
  if args.threads > 1:
    p = Pool(processes=args.threads)
  for entries in ls:
    bedarray = []
    #print len(entries.get_payload())
    for e in entries.get_payload():
      if not args.all_alignments and not e.indexed_as_primary_alignment(): continue
      if not e.is_aligned(): continue
      tx = e.get_target_transcript(min_intron=args.minimum_intron_size)
      for exon in tx.exons:
        bedarray.append(exon.rng.copy())
        #print exon.rng.get_range_string()
    if len(bedarray) == 0: continue
    if args.threads > 1:
      p.apply_async(get_output,args=(bedarray,z,),callback=do_output)
    else:
      r = get_output(bedarray,z)
      do_output(r)
    z += 1
  if args.threads > 1:
    p.close()
    p.join()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
  args.output.close()
def main():
    parser = argparse.ArgumentParser(
        description="Convert aligned BAM/SAM entries into PSL lines.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN or specify a BAM file")
    parser.add_argument('-r',
                        '--reference',
                        help="Reference fasta",
                        required=True)
    args = parser.parse_args()

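    # The reference fasta is required and gets attached to the alignment stream.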
    ref = None
    if args.reference:
        ref = FastaData(open(args.reference, 'rb').read())

    if args.input == '-':
        args.input = SamStream(sys.stdin, reference=ref)
    else:
        args.input = BAMFile(args.input, reference=ref)
    for e in args.input:
        if e.is_aligned():
            print e.get_PSL()
def do_chunk(ilines, infile, args):
    ilines = [x.rstrip().split("\t") for x in ilines]
    coord = [int(x) for x in ilines[0][2:4]]
    bf = BAMFile(infile, blockStart=coord[0], innerStart=coord[1])
    results = []
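    # Entries are read sequentially from the given block/inner offset, one per input line.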
    for i in range(0, len(ilines)):
        flag = int(ilines[i][5])
        e = bf.read_entry()
        #if not e: break
        #print {'qlen':e.get_original_query_length(),'alen':e.get_aligned_bases_count()}
        value = None
        if e.is_aligned():
            tx = e.get_target_transcript(args.minimum_intron_size)
            value = {
                'qrng': e.get_actual_original_query_range().get_range_string(),
                'tx': tx.get_gpd_line(),
                'flag': flag,
                'qlen': e.get_original_query_length(),
                'aligned_bases': e.get_aligned_bases_count()
            }
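            # Serialize each per-read record as base64(zlib(pickle)) so it fits on one tab-delimited line.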
            results.append(
                e.value('qname') + "\t" +
                base64.b64encode(zlib.compress(pickle.dumps(value))))
            #results.append([e.value('qname'),zlib.compress(pickle.dumps(value))])
        else:
            value = {
                'qrng': '',
                'tx': '',
                'flag': flag,
                'qlen': e.get_original_query_length(),
                'aligned_bases': 0
            }
            results.append(
                e.value('qname') + "\t" +
                base64.b64encode(zlib.compress(pickle.dumps(value))))
            #results.append([e.value('qname'),zlib.compress(pickle.dumps(value))])
    return results
def main(args):
    # make our error profile report
    sys.stderr.write("Reading reference fasta\n")
    ref = FastaData(open(args.reference).read())
    sys.stderr.write("Reading alignments\n")
    epf = ErrorProfileFactory()
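    # Accumulate per-context error counts from the sampled alignments.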
    if args.random:
        bf = None
        if args.input_index:
            bf = BAMFile(args.input,
                         reference=ref,
                         index_file=args.input_index)
            bf.read_index(index_file=args.input_index)
        else:
            bf = BAMFile(args.input, reference=ref)
            bf.read_index()
        if not bf.has_index():
            sys.stderr.write("Random access requires an index be set\n")
            sys.exit(1)
        z = 0
        strand = 'target'
        if args.query: strand = 'query'
        con = 0
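        # Sample random read names, taking each read's longest alignment, until enough alignments or context coverage is seen.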
        while True:
            rname = random.choice(bf.index.get_names())
            #print rname
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            #print coord
            if not coord: continue
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                sys.stderr.write(
                    str(z) + " alignments, " + str(con) +
                    " min context coverage\r")
                if args.max_alignments <= z: break
                if args.stopping_point <= con: break

    else:
        bf = BAMFile(args.input, reference=ref)
        z = 0
        strand = 'target'
        if args.query: strand = 'query'
        con = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                sys.stderr.write(
                    str(z) + " alignments, " + str(con) +
                    " min context coverage\r")
                if args.max_alignments <= z: break
                if args.stopping_point <= con: break
    sys.stderr.write("\n")
    sys.stderr.write('working with:' + "\n")
    sys.stderr.write(
        str(z) + " alignments, " + str(con) + " min context coverage" + "\n")
    epf.write_context_error_report(args.tempdir + '/err.txt', strand)

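    # Render one context-error plot per requested output file using the bundled R script.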
    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_base_error_context.r ' + args.tempdir + '/err.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())
    sys.stderr.write("finished\n")
    if args.output_raw:
        of = open(args.output_raw, 'w')
        with open(args.tempdir + "/err.txt") as inf:
            for line in inf:
                of.write(line)
        of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
  sys.stderr.write("Read reference fasta\n")
  fasta = FastaData(open(args.reference_fasta).read())
  sys.stderr.write("Read alignment file\n")
  bf = BAMFile(args.bam_input,reference=fasta)
  bf.read_index()
  total_qualities = []
  for j in range(0,100):
    total_qualities.append([])
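  # 100 bins of (ordinal quality, run length) observations, populated by do_qualities (not shown here).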
  ef = ErrorProfileFactory()
  mincontext = 0
  alignments = 0
  for i in range(0,args.max_alignments):
    rname = random.choice(bf.index.get_names())
    coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
    if not coord: continue
    bam = bf.fetch_by_coord(coord)
    qual = bam.value('qual')
    do_qualities(total_qualities,qual)
    if not bam.is_aligned(): continue
    alignments += 1
    ef.add_alignment(bam)
    if i%100 == 0:
      mincontext = ef.get_min_context_count('target')
      if mincontext:
        if mincontext >= args.min_context and alignments >= args.min_alignments: break
    sys.stderr.write(str(i+1)+" lines   "+str(alignments)+"/"+str(args.min_alignments)+" alignments   "+str(mincontext)+"/"+str(args.min_context)+" mincontext        \r")
  sys.stderr.write("\n")
  sys.stderr.write(str(mincontext)+" minimum contexts observed\n")
  target_context = ef.get_target_context_error_report()
  general_error_stats = ef.get_alignment_errors().get_stats()
  general_error_report = ef.get_alignment_errors().get_report()
  # convert report to table
  general_all = [x.split("\t") for x in general_error_report.rstrip().split("\n")]
  general_head = general_all[0]
  #print [y for y in general_all[1:]]
  general_data = [[y[0],y[1],int(y[2]),int(y[3])] for y in general_all[1:]]
  general_error_report = {'head':general_head,'data':general_data}
  quality_counts = []
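  # Collapse each bin into [ordinal_quality, run_length, count] triples.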
  for vals in total_qualities:
    garr = []
    grp = {}
    for v in vals:
      if v[0] not in grp: grp[v[0]] = {}  # keyed by ordinal quality value
      if v[1] not in grp[v[0]]: grp[v[0]][v[1]] = 0  # then by run length
      grp[v[0]][v[1]] += 1
    for ordval in sorted(grp.keys()):
      for runlen in sorted(grp[ordval].keys()):
        garr.append([ordval,runlen,grp[ordval][runlen]])
    quality_counts.append(garr)
  # Quality counts now has 100 bins, each an ordered array of
  # [ordinal_quality, run_length, observation_count]
  
  # Can prepare an output
  output = {}
  output['quality_counts'] = quality_counts
  output['context_error'] = target_context
  output['alignment_error'] = general_error_report
  output['error_stats'] = general_error_stats
  of = None
  if args.output[-3:] == '.gz':
    of = gzip.open(args.output, 'w')
  else:
    of = open(args.output, 'w')
  of.write(base64.b64encode(zlib.compress(json.dumps(output)))+"\n")
  of.close()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
def main(args):

    sys.stderr.write("Reading our reference Fasta\n")
    ref = FastaData(open(args.reference, 'rb').read())
    sys.stderr.write("Finished reading our reference Fasta\n")
    bf = None
    if args.input_index:
        bf = BAMFile(args.input, reference=ref, index_file=args.input_index)
        bf.read_index(index_file=args.input_index)
    else:
        bf = BAMFile(args.input, reference=ref)
        bf.read_index()
    epf = ErrorProfileFactory()
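    # Profile alignment errors until max_length aligned bases have been accumulated.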
    if args.random:
        if not bf.has_index():
            sys.stderr.write(
                "Random access requires our format of index bgi to be set\n")
            sys.exit()
        z = 0
        while True:
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord: continue
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                #print z
                if z % 100 == 1:
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con: break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) +
                        " bases from " + str(z) + " alignments\r")
        sys.stderr.write("\n")
    else:
        z = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                #print z
                if z % 100 == 1:
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con: break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) +
                        " bases from " + str(z) + " alignments\r")
        sys.stderr.write("\n")
    of = open(args.tempdir + '/report.txt', 'w')
    of.write(epf.get_alignment_errors().get_report())
    of.close()

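    # Generate the requested plots with plot_alignment_errors.r, one per output file.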
    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_alignment_errors.r ' + args.tempdir + '/report.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())

    if args.output_raw:
        of = open(args.output_raw, 'w')
        with open(args.tempdir + "/report.txt") as inf:
            for line in inf:
                of.write(line)
        of.close()
    if args.output_stats:
        of = open(args.output_stats, 'w')
        of.write(epf.get_alignment_errors().get_stats())
        of.close()
    sys.stderr.write("finished\n")
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)