def test_runner(self):
    """Run CombineRunner over five pre-chunked cluster_out directories and
    verify that all combined output files exist and hold the expected
    HQ/LQ/consensus isoforms.
    """
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
    d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
    # One cluster_out directory per read-length bin.
    split_dirs = [op.join(d, b, "cluster_out") for b in
                  ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0",
                   "3to4kb_part0", "4to5kb_part0")]
    # BUG FIX: removed stray Python-2-only debug statement `print split_dirs`.
    out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
    # Start from a clean output directory.
    rmpath(out_combined_dir)
    mkdir(out_combined_dir)
    obj = CombineRunner(combined_dir=out_combined_dir,
                        sample_name="mysample",
                        split_dirs=split_dirs,
                        ipq_opts=ipq_opts)
    obj.run()
    # Every combined output file must have been written.
    expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa,
                        obj.all_lq_fq, obj.all_consensus_isoforms_fa,
                        obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
    self.assertTrue(all(op.exists(f) for f in expected_out_fns))
    expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826',
                            'i2_HQ_mysample|c2/f9p14/2470',
                            'i2_HQ_mysample|c5/f7p19/2472',
                            'i2_HQ_mysample|c10/f8p16/2457',
                            'i2_HQ_mysample|c98/f2p10/2081',
                            'i2_HQ_mysample|c108/f23p28/2471']
    # FASTA and FASTQ HQ outputs must agree on isoform ids
    # (id is the first whitespace-separated token of the record name).
    self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)],
                     expected_hq_isoforms)
    self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)],
                     expected_hq_isoforms)
    expected_lq_isoforms_num = 73
    # Count records without materializing a throwaway list.
    self.assertEqual(sum(1 for _ in FastaReader(obj.all_lq_fa)),
                     expected_lq_isoforms_num)
    expected_consensus_isoforms_num = 79
    self.assertEqual(sum(1 for _ in FastaReader(obj.all_consensus_isoforms_fa)),
                     expected_consensus_isoforms_num)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.

    Parameters:
      input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
      sam_filename -- output sam file, produced by gmap and sorted.
      gmap_db_dir -- gmap database directory
      gmap_db_name -- gmap database name
      gmap_nproc -- gmap nproc
    """
    # Intermediate files live next to the final SAM output.
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"
    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ -- gmap cannot read XML.
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exists" % gmap_input_filename)
    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls
    # ${gmap_db_name}.* files, pause briefly, then return to the original cwd.
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
                'ls *.iit *meta', 'sleep 3',
                'cd %s' % real_upath(cwd)]
    execute(' && '.join(cmd_args))
    # Build the gmap command; stdout (alignments) goes to the temp SAM and
    # stderr to the log file.
    cmd_args = ['gmap',
                '-D {d}'.format(d=real_upath(gmap_db_dir)),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0', '-z sense_force', '--cross-species', '-f samse',
                '--max-intronlength-ends 200000',  # for long genes
                real_upath(gmap_input_filename),
                '>', real_upath(unsorted_sam_filename),
                '2>{log}'.format(log=real_upath(log_filename))]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        # NOTE(review): one blind retry after a short pause -- presumably to
        # ride out transient filesystem hiccups; a second failure propagates.
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))
    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)
    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Align isoforms against a GMAP database and write a sorted SAM file.

    Parameters:
      input_filename -- isoforms to map (FASTA, FASTQ, or dataset XML).
      sam_filename -- destination sorted SAM file.
      gmap_db_dir -- GMAP database directory.
      gmap_db_name -- GMAP database name.
      gmap_nproc -- number of gmap worker threads.
    """
    raw_sam = sam_filename + ".tmp"
    gmap_log = sam_filename + ".log"
    query_fn = input_filename
    if input_filename.endswith('.xml'):
        # Dataset XML cannot be fed to gmap directly; flatten to FASTA/FASTQ.
        reader = ContigSetReaderWrapper(input_filename)
        query_fn = reader.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(query_fn):
        raise IOError("Gmap input file %s does not exists" % query_fn)
    # Touch the database directory first to guard against mount latency:
    # cd in, list the index files, pause briefly, then return.
    cwd = realpath(os.getcwd())
    execute('cd %s && ls *.iit *meta && sleep 3 && cd %s'
            % (op.join(gmap_db_dir, gmap_db_name), cwd))
    gmap_cmd = ('gmap -D {d} -d {name} -t {nproc} -n 0 -z sense_force '
                '--cross-species -f samse {query} > {out} 2>{log}').format(
                    d=gmap_db_dir, name=gmap_db_name, nproc=gmap_nproc,
                    query=query_fn, out=raw_sam, log=gmap_log)
    # Map isoforms; retry once after a short pause if the first run fails,
    # letting a second failure propagate.
    try:
        execute(gmap_cmd)
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(gmap_cmd)
    # Sort the raw alignments into the final SAM, then drop the temp file.
    sort_sam(in_sam=raw_sam, out_sam=sam_filename)
    rmpath(raw_sam)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Align isoforms against a GMAP database and write a sorted SAM file.

    Parameters:
      input_filename -- isoforms to map (FASTA, FASTQ, or dataset XML).
      sam_filename -- destination sorted SAM file.
      gmap_db_dir -- GMAP database directory.
      gmap_db_name -- GMAP database name.
      gmap_nproc -- number of gmap worker threads.
    """
    raw_sam = sam_filename + ".tmp"
    gmap_log = sam_filename + ".log"
    query_fn = input_filename
    if input_filename.endswith('.xml'):
        # Dataset XML must be flattened to FASTA/FASTQ before gmap can read it.
        reader = ContigSetReaderWrapper(input_filename)
        query_fn = reader.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(query_fn):
        raise IOError("Gmap input file %s does not exists" % query_fn)
    # Run gmap: alignments stream to the temp SAM, stderr to the log.
    execute(('gmap -D {d} -d {name} -t {nproc} -n 0 -z sense_force '
             '--cross-species -f samse {query} > {out} 2>{log}').format(
                 d=gmap_db_dir, name=gmap_db_name, nproc=gmap_nproc,
                 query=query_fn, out=raw_sam, log=gmap_log))
    # Seed the output with the SAM header lines...
    copy_sam_header(in_sam=raw_sam, out_sam=sam_filename)
    # ...then append the alignment records sorted by reference name and
    # position, stripping the '@' header lines the sort would interleave.
    execute("sort -k 3,3 -k 4,4n {raw} | grep -v '^@' >> {out}".format(
        raw=raw_sam, out=sam_filename))
    # Remove the intermediate unsorted SAM.
    rmpath(raw_sam)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.

    Parameters:
      input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
      sam_filename -- output sam file, produced by gmap and sorted.
      gmap_db_dir -- gmap database directory
      gmap_db_name -- gmap database name
      gmap_nproc -- gmap nproc
    """
    # Intermediate files live next to the final SAM output.
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"
    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ -- gmap cannot read XML.
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exists" % gmap_input_filename)
    # stdout (alignments) goes to the temp SAM; stderr to the log file.
    cmd_args = [
        'gmap',
        '-D {d}'.format(d=gmap_db_dir),
        '-d {name}'.format(name=gmap_db_name),
        '-t {nproc}'.format(nproc=gmap_nproc),
        '-n 0', '-z sense_force', '--cross-species', '-f samse',
        gmap_input_filename,
        '>', unsorted_sam_filename,
        '2>{log}'.format(log=log_filename)
    ]
    # Call gmap to map isoforms to reference and output sam.
    execute(' '.join(cmd_args))
    # Copy SAM headers into the final output first.
    copy_sam_header(in_sam=unsorted_sam_filename, out_sam=sam_filename)
    # Call sort to sort gmap output sam file by reference name and position,
    # dropping '@' header lines, and append records after the copied header.
    cmd_args = [
        'sort', '-k 3,3', '-k 4,4n', unsorted_sam_filename,
        '| grep -v \'^@\'', '>>', sam_filename
    ]
    execute(' '.join(cmd_args))
    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
def test_iter_gmap_sam(self):
    """
    Test iter_gmap_sam, which takes a sorted gmap sam file as input, and
    iterates over groups of overlapping sam records (which are supposed to
    belong to the same isoform family).
    """
    ignored_ids_txt = op.join(OUT_DIR, 'iter_gmap_sam.ignored.txt')
    rmpath(ignored_ids_txt)
    # Records that cannot be grouped are reported through this writer.
    with open(ignored_ids_txt, 'w') as ignored_ids_writer:
        groups = _get_sam_groups(ignored_ids_writer)
    self.assertTrue(op.exists(ignored_ids_txt))
    # The ignored-read id is the first whitespace-separated token per line.
    ignored_ids = [line.split(' ')[0] for line in open(ignored_ids_txt, 'r')]
    self.assertEqual(len(ignored_ids), 108)
    self.assertEqual(len(groups), 9)
    # Per-group record counts on the plus and minus strands.
    expected_plus_lens = [10, 2, 129, 31, 0, 0, 348, 141, 0]
    self.assertEqual([len(g["+"]) for g in groups], expected_plus_lens)
    expected_minus_lens = [77, 36, 11, 0, 6, 9, 2, 2, 72]
    self.assertEqual([len(g["-"]) for g in groups], expected_minus_lens)
    # Each group must map to a single SIRV reference.
    self.assertTrue(all([r.sID == 'SIRV1' for r in groups[0]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV2' for r in groups[1]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV3' for r in groups[2]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV4' for r in groups[3]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV4' for r in groups[4]["-"]]))
    self.assertTrue(all([r.sID == 'SIRV4' for r in groups[5]["-"]]))
    self.assertTrue(all([r.sID == 'SIRV5' for r in groups[6]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV6' for r in groups[7]["+"]]))
    self.assertTrue(all([r.sID == 'SIRV7' for r in groups[8]["-"]]))
    expected_g0_plus_sStart = [10710, 10712, 10712, 10712, 10712, 10712,
                               10712, 10713, 10713, 10715]
    expected_g0_plus_sEnd = [11641, 11641, 11638, 11640, 11641, 11641,
                             11638, 11641, 11640, 11641]
    # BUG FIX: these checks previously used assertTrue(expected, actual),
    # which always passes (the second argument is just the failure message);
    # they must be equality assertions.
    self.assertEqual(expected_g0_plus_sStart, [r.sStart for r in groups[0]["+"]])
    self.assertEqual(expected_g0_plus_sEnd, [r.sEnd for r in groups[0]["+"]])
    expected_g4_minus_sStart = [3640, 3640, 3642, 3642, 3642, 3644]
    expected_g4_minus_sEnd = [5157, 5157, 5157, 5157, 3829, 5157]
    # BUG FIX: the last two checks compared against the g0 lists, leaving the
    # expected_g4_minus_* lists unused; compare group 4 against its own data.
    self.assertEqual(expected_g4_minus_sStart, [r.sStart for r in groups[4]["-"]])
    self.assertEqual(expected_g4_minus_sEnd, [r.sEnd for r in groups[4]["-"]])
def test_Branch(self):
    """
    Test Branch and Branch.run.
    Note that fuzzy junctions are not merged.
    """
    test_name = "test_branch"
    good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
    bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
    group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")
    # Remove stale outputs from previous runs.
    rmpath(good_gff_fn)
    rmpath(bad_gff_fn)
    rmpath(group_fn)
    b = Branch(isoform_filename=READS_DS, sam_filename=SORTED_GMAP_SAM,
               cov_threshold=2, min_aln_coverage=0.99, min_aln_identity=0.95)
    b.run(allow_extra_5exon=True, skip_5_exon_alt=False,
          ignored_ids_fn=None, good_gff_fn=good_gff_fn,
          bad_gff_fn=bad_gff_fn, group_fn=group_fn)
    # All three outputs must have been written.
    self.assertTrue(op.exists(good_gff_fn))
    self.assertTrue(op.exists(bad_gff_fn))
    self.assertTrue(op.exists(group_fn))
    std_good_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".good.gff.unfuzzy")
    std_bad_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".bad.gff.unfuzzy")
    std_group_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".group.txt.unfuzzy")
    # BUG FIX: use parenthesized print() so the module also parses under
    # Python 3 (prints a single formatted string in both versions).
    print("Comparing %s and %s" % (good_gff_fn, std_good_gff_fn))
    # Outputs must be byte-identical to the checked-in standards.
    self.assertTrue(filecmp.cmp(good_gff_fn, std_good_gff_fn))
    self.assertTrue(filecmp.cmp(bad_gff_fn, std_bad_gff_fn))
    self.assertTrue(filecmp.cmp(group_fn, std_group_fn))
def run_main(chunk_json, sam_output, chunk_key):
    """Concatenate the chunked SAM files listed under chunk_key in
    chunk_json, sort the result, and write it to sam_output.

    Returns 0 on success.
    """
    chunks = load_pipeline_chunks_from_json(chunk_json)
    # Allow looseness: accept the chunk key with or without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        # BUG FIX: Logger.warn is deprecated; use Logger.warning.
        log.warning("Prepending chunk key with '$chunk.' to '%s'", str(chunk_key))
    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", (', '.join(sam_files)))
    log.info("Concatenate chunked SAM files to %s.", sam_output)
    # concatenate sam files into an unsorted intermediate
    unsorted_sam_output = sam_output + ".unsorted.sam"
    concatenate_sam(sam_files, unsorted_sam_output)
    # then sort into the final output
    sort_sam(unsorted_sam_output, sam_output)
    # remove intermediate file
    rmpath(unsorted_sam_output)
    return 0
def test_collapse_sam_records(self):
    """Test collapse_sam_records, which takes in a list of grouped sam
    records and writes collapsed gff records to good_gff_writer|bad_gff_writer.
    A collapsed gff record is 'good' if there are >= cov_threshold supportive
    sam records belonging to its group; otherwise, 'bad'.
    """
    test_name = "test_collapse_sam_records"
    good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
    bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
    group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")
    # Remove stale outputs from previous runs.
    rmpath(good_gff_fn)
    rmpath(bad_gff_fn)
    rmpath(group_fn)
    records = _get_sam_groups()[0]["+"]  # contains 10 sam records
    with CollapseGffWriter(good_gff_fn) as good_gff_writer, \
         CollapseGffWriter(bad_gff_fn) as bad_gff_writer, \
         GroupWriter(group_fn) as group_writer:
        collapse_sam_records(records=records, cuff_index=0, cov_threshold=2,
                             allow_extra_5exon=False, skip_5_exon_alt=True,
                             good_gff_writer=good_gff_writer,
                             bad_gff_writer=bad_gff_writer,
                             group_writer=group_writer)

    def str_to_gffrecord(line):
        """Parse one tab-delimited GFF line into a Gff3Record; attributes
        come from 'key value' pairs in the 9th column."""
        fields = line.strip().split('\t')
        # BUG FIX: removed stray Python-2-only debug statement `print fields`.
        attributes = []
        for attr_tuple in fields[8].split(';'):
            if len(attr_tuple.strip()) == 0:
                continue
            fs = attr_tuple.strip().split(' ')
            if len(fs) == 2:
                attributes.append((fs[0], fs[1].replace('"', '')))
        return Gff3Record(seqid=fields[0], start=fields[3], end=fields[4],
                          type=fields[2], attributes=attributes)

    # No record should fall below the coverage threshold for this group.
    bad_gff_records = [str_to_gffrecord(line)
                       for line in open(bad_gff_fn, 'r')
                       if not line.startswith('##')]
    self.assertEqual(len(bad_gff_records), 0)
    good_gff_records = [str_to_gffrecord(line)
                        for line in open(good_gff_fn, 'r')
                        if not line.startswith('##')]
    self.assertEqual(len(good_gff_records), 4)
    # One transcript plus its three exons.
    self.assertEqual(
        [(int(r.start), int(r.end), r.type,
          r.attributes['gene_id'], r.attributes['transcript_id'])
         for r in good_gff_records],
        [(10711, 11641, 'transcript', "PB.0", "PB.0.1"),
         (10711, 10791, 'exon', "PB.0", "PB.0.1"),
         (10883, 11057, 'exon', "PB.0", "PB.0.1"),
         (11435, 11641, 'exon', "PB.0", "PB.0.1"),
         ])
def test_map_isoforms_and_sort(self):
    """Test map_isoforms_and_sort"""
    # Exercise FASTA/FASTQ inputs and their dataset-xml counterparts;
    # each run must produce a sorted SAM file.
    cases = [('fasta', GMAP_INPUT_FASTA),
             ('fastq', GMAP_INPUT_FASTQ),
             ('fasta_ds', GMAP_INPUT_FASTA_DS),
             ('fastq_ds', GMAP_INPUT_FASTQ_DS)]
    for tag, query_fn in cases:
        # Output names contain a space -- presumably intentional, to
        # exercise paths with spaces; keep as-is.
        out_fn = op.join(_OUT_DIR_, 'test map_isoforms_and_sort_%s.sam' % tag)
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=query_fn,
                              sam_filename=out_fn,
                              gmap_db_dir=self.gmap_db_dir,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))
def test_map_isoforms_and_sort(self):
    """Test map_isoforms_and_sort"""
    # Exercise FASTA/FASTQ inputs and their dataset-xml counterparts;
    # each run must produce a sorted SAM file.
    cases = [('fasta', GMAP_INPUT_FASTA),
             ('fastq', GMAP_INPUT_FASTQ),
             ('fasta_ds', GMAP_INPUT_FASTA_DS),
             ('fastq_ds', GMAP_INPUT_FASTQ_DS)]
    for tag, query_fn in cases:
        out_fn = op.join(_OUT_DIR_, 'test_map_isoforms_and_sort_%s.sam' % tag)
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=query_fn,
                              sam_filename=out_fn,
                              gmap_db_dir=GMAP_DB,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))
def test_Branch(self):
    """
    Test Branch and Branch.run.
    Note that fuzzy junctions are not merged.
    """
    test_name = "test_branch"
    good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
    bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
    group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")
    # Remove stale outputs from previous runs.
    rmpath(good_gff_fn)
    rmpath(bad_gff_fn)
    rmpath(group_fn)
    b = Branch(isoform_filename=READS_DS, sam_filename=SORTED_GMAP_SAM,
               cov_threshold=2, min_aln_coverage=0.99, min_aln_identity=0.95)
    b.run(allow_extra_5exon=True, skip_5_exon_alt=False,
          ignored_ids_fn=None, good_gff_fn=good_gff_fn,
          bad_gff_fn=bad_gff_fn, group_fn=group_fn)
    # All three outputs must have been written.
    self.assertTrue(op.exists(good_gff_fn))
    self.assertTrue(op.exists(bad_gff_fn))
    self.assertTrue(op.exists(group_fn))
    # Checked-in standard outputs to compare against.
    std_good_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".good.gff.unfuzzy")
    std_bad_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".bad.gff.unfuzzy")
    std_group_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".group.txt.unfuzzy")
    print "Comparing %s and %s" % (good_gff_fn, std_good_gff_fn)
    # Outputs must be byte-identical to the standards.
    self.assertTrue(filecmp.cmp(good_gff_fn, std_good_gff_fn))
    self.assertTrue(filecmp.cmp(bad_gff_fn, std_bad_gff_fn))
    self.assertTrue(filecmp.cmp(group_fn, std_group_fn))
def setUp(self):
    """Define input and output file."""
    # Start each test from an empty output directory.
    rmpath(_OUT_DIR_)
    mkdir(_OUT_DIR_)
def args_runner(args):
    """args runner

    End-to-end tofu driver: bin FLNC reads, run ICE/Quiver per bin, combine
    polished isoforms, map them to the genome with gmap, then collapse,
    count, and filter the mapped isoforms. Returns 0 on success.
    """
    logging.info("%s arguments are:\n%s\n", __file__, args)
    # sanity check arguments
    _sanity_check_args(args)
    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()
    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)
    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")
        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        # Resume support: skip bins whose HQ output already exists.
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)
            obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file, nfl_fa=args.nfl_fa,
                          bas_fofn=args.bas_fofn, ccs_fofn=args.ccs_fofn,
                          fasta_fofn=args.fasta_fofn, out_fa=cur_out_cons,
                          sge_opts=sge_opts, ice_opts=ice_opts, ipq_opts=ipq_opts)
            if args.mem_debug:  # DEBUG: profile memory while clustering.
                from memory_profiler import memory_usage
                start_t = time.time()
                mem_usage = memory_usage(obj.run, interval=60)
                end_t = time.time()
                with open('mem_debug.log', 'a') as f:
                    f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                        split_dir, end_t - start_t))
                    f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                    f.write("Memory usage: {0}\n".format(mem_usage))
            else:
                obj.run()
            if not args.keep_tmp_files:  # by default, delete all temporary files.
                # NOTE(review): fire-and-forget deletions -- nothing waits on
                # these Popen handles; confirm that is intended.
                logging.info("Deleting %s", ipq_f.tmp_dir)
                subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
                logging.info("Deleting %s", ipq_f.quivered_dir)
                subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])
    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    # Optionally expose the combined summary/report under caller-chosen names.
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)
    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)
    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    # Choose FASTA or FASTQ input to match the requested output format.
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)
    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
        out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage,
        min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage,
        max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon,
        min_count=args.min_count)
    return 0
def setUp(self):
    """Define input and output file."""
    # Start each test from an empty output directory.
    rmpath(_OUT_DIR_)
    mkdir(_OUT_DIR_)
    # Symlink the GMAP database under a name containing a space --
    # presumably to exercise handling of paths with spaces; confirm.
    self.gmap_db_dir = op.join(_OUT_DIR_, 'gmap db dir')
    os.symlink(GMAP_DB, self.gmap_db_dir)
def setUp(self):
    """Define input and output file."""
    # Start each test from an empty output directory.
    rmpath(_OUT_DIR_)
    mkdir(_OUT_DIR_)
def args_runner(args):
    """Run given input args, e.g.,
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2 --no_filter_subsets

    Filters collapsed isoforms by minimum FL count and (optionally) removes
    isoforms that are subsets of another isoform. Returns 0 on success.
    """
    in_fq, out_fq = args.in_rep_fastq, args.out_rep_fastq

    def _get_prefix_of_rep_fq(fn):
        """Return prefix of *.rep.fq, i.e., the filename with the
        '.rep.fastq'/'.rep.fq' (or plain '.fastq'/'.fq') suffix removed."""
        # Idiom: endswith accepts a tuple of suffixes.
        if fn.endswith((".rep.fastq", ".rep.fq")):
            return '.'.join(fn.split(".")[0:-2])
        elif fn.endswith((".fastq", ".fq")):
            return '.'.join(fn.split(".")[0:-1])
        raise ValueError("Invalid collapsed isoforms .rep.fastq file %s" % fn)

    input_prefix = _get_prefix_of_rep_fq(in_fq)
    output_prefix = _get_prefix_of_rep_fq(out_fq)
    # infer group.txt, abundance.txt and gff companion files from the prefix
    in_group_filename = input_prefix + ".group.txt"
    in_abundance_filename = input_prefix + ".abundance.txt"
    in_gff_filename = input_prefix + ".gff"
    # Intermediates from stage 1 (count filter), before subset removal.
    tmp_out_abundance_filename = output_prefix + ".has_subsets.abundance.txt"
    tmp_out_gff_filename = output_prefix + ".has_subsets.gff"
    tmp_out_fq = output_prefix + ".has_subsets.rep.fastq"
    out_abundance_filename = output_prefix + ".abundance.txt"
    out_gff_filename = output_prefix + ".gff"
    # Filter collapsed isoforms by min FL count.
    logging.info("Filtering collapsed isoforms by count %s", args.min_count)
    filter_by_count(in_group_filename=in_group_filename,
                    in_abundance_filename=in_abundance_filename,
                    in_gff_filename=in_gff_filename, in_rep_filename=in_fq,
                    out_abundance_filename=tmp_out_abundance_filename,
                    out_gff_filename=tmp_out_gff_filename,
                    out_rep_filename=tmp_out_fq,
                    min_count=args.min_count)
    # Remove collapsed isoforms which are a subset of another isoform
    logging.info("Filtering out subsets collapsed isoforms = %s",
                 args.filter_out_subsets)
    # BUG FIX: was `if args.filter_out_subsets is True:` -- identity
    # comparison fails for truthy non-bool values; test truthiness instead.
    if args.filter_out_subsets:
        filter_out_subsets(in_abundance_filename=tmp_out_abundance_filename,
                           in_gff_filename=tmp_out_gff_filename,
                           in_rep_filename=tmp_out_fq,
                           out_abundance_filename=out_abundance_filename,
                           out_gff_filename=out_gff_filename,
                           out_rep_filename=out_fq,
                           max_fuzzy_junction=args.max_fuzzy_junction)
        rmpath(tmp_out_abundance_filename)
        rmpath(tmp_out_gff_filename)
        rmpath(tmp_out_fq)
    else:
        # No subset filtering: stage-1 outputs become the final outputs.
        mv(tmp_out_abundance_filename, out_abundance_filename)
        mv(tmp_out_gff_filename, out_gff_filename)
        mv(tmp_out_fq, out_fq)
    logging.info("Filtered collapsed isoforms sequences written to %s",
                 realpath(out_fq))
    logging.info("Filtered collapsed isoforms abundance written to %s",
                 realpath(out_abundance_filename))
    logging.info("Filtered collapsed isoforms gff written to %s",
                 realpath(out_gff_filename))
    return 0
from pbcore.io import FastqReader from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader from pbtranscript.Utils import rmpath, mkdir from pbtranscript.filtering.FilteringUtils import good_isoform_ids_by_count, \ good_isoform_ids_by_removing_subsets, filter_by_count, filter_out_subsets from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR GROUP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.group.txt") ABUNDANCE_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.abundance.txt") GFF_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.gff") REP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.rep.fastq") _OUT_DIR_ = op.join(OUT_DIR, "test_filtering") rmpath(_OUT_DIR_) mkdir(_OUT_DIR_) class TEST_FilteringUtils(unittest.TestCase): """Test functions of pbtranscript.filtering.FilteringUtils.""" def setUp(self): """Define input and output file.""" self.expected_good = ['PB.2.5', 'PB.5.1', 'PB.7.1', 'PB.10.2', 'PB.10.42', 'PB.12.1'] self.expected_diff = ['PB.10.42', 'PB.10.36', 'PB.10.35'] def test_good_isoform_ids_by_count(self): """Test good_isoform_ids_by_count""" good = good_isoform_ids_by_count(in_group_filename=GROUP_FN, in_abundance_filename=ABUNDANCE_FN, min_count=20)
"""Test pbtranscript.collapsing.Branch.""" import unittest import os.path as op import cPickle import filecmp import numpy as np from pbtranscript.Utils import rmpath, mkdir from pbtranscript.tasks.map_isoforms_to_genome import gmap_db_and_name_from_ds from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR READS_DS = op.join(SIV_DATA_DIR, 'test_collapsing', 'gmap-input.fastq.contigset.xml') GMAP_DS = op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir/SIRV/gmapreferenceset.xml") _OUT_DIR_ = op.join(OUT_DIR, "test_map_isoforms_to_genome") rmpath(_OUT_DIR_) mkdir(_OUT_DIR_) class TEST_map_isoforms_to_genome(unittest.TestCase): """Test functions of pbtranscript.tasks.map_isoforms_to_genome.""" def setUp(self): """Define input and output file.""" def test_gmap_db_and_name_from_ds(self): """Test map_isoforms_to_genome.gmap_db_and_name_from_ds""" gmap_db, gmap_name = gmap_db_and_name_from_ds(GMAP_DS) self.assertEqual(gmap_db, op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir", "SIRV")) self.assertEqual(gmap_name, "gmap_db")
def args_runner(args):
    """args runner

    End-to-end tofu driver: bin FLNC reads, run ICE/Quiver per bin, combine
    polished isoforms, map them to the genome with gmap, then collapse,
    count, and filter the mapped isoforms. Returns 0 on success.
    """
    logging.info("%s arguments are:\n%s\n", __file__, args)
    # sanity check arguments
    _sanity_check_args(args)
    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()
    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)
    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")
        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        # Resume support: skip bins whose HQ output already exists.
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)
            obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file, nfl_fa=args.nfl_fa,
                          bas_fofn=args.bas_fofn, ccs_fofn=args.ccs_fofn,
                          fasta_fofn=args.fasta_fofn, out_fa=cur_out_cons,
                          sge_opts=sge_opts, ice_opts=ice_opts, ipq_opts=ipq_opts)
            if args.mem_debug:  # DEBUG: profile memory while clustering.
                from memory_profiler import memory_usage
                start_t = time.time()
                mem_usage = memory_usage(obj.run, interval=60)
                end_t = time.time()
                with open('mem_debug.log', 'a') as f:
                    f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                        split_dir, end_t - start_t))
                    f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                    f.write("Memory usage: {0}\n".format(mem_usage))
            else:
                obj.run()
            if not args.keep_tmp_files:  # by default, delete all temporary files.
                # NOTE(review): fire-and-forget deletions -- nothing waits on
                # these Popen handles; confirm that is intended.
                logging.info("Deleting %s", ipq_f.tmp_dir)
                subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
                logging.info("Deleting %s", ipq_f.quivered_dir)
                subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])
    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    # Optionally expose the combined summary/report under caller-chosen names.
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)
    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)
    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    # Choose FASTA or FASTQ input to match the requested output format.
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)
    post_mapping_to_genome_runner(in_isoforms=in_isoforms,
                                  in_sam=tofu_f.sorted_gmap_sam,
                                  in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
                                  out_isoforms=args.collapsed_filtered_fn,
                                  out_gff=args.gff_fn,
                                  out_abundance=args.abundance_fn,
                                  out_group=args.group_fn,
                                  out_read_stat=args.read_stat_fn,
                                  min_aln_coverage=args.min_aln_coverage,
                                  min_aln_identity=args.min_aln_identity,
                                  min_flnc_coverage=args.min_flnc_coverage,
                                  max_fuzzy_junction=args.max_fuzzy_junction,
                                  allow_extra_5exon=args.allow_extra_5exon,
                                  min_count=args.min_count)
    return 0
def test_collapse_sam_records(self):
    """Test collapse_sam_records, which takes in a list of grouped sam records.
    and write collapsed gff records to good_gff_writer|bad_gff_writer.
    A collapsed gff record is 'good' if there are >= cov_threshold supportive
    sam records belonging to its group; otherwise, 'bad'.
    """
    test_name = "test_collapse_sam_records"
    good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
    bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
    group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")
    # Remove stale outputs from previous runs.
    rmpath(good_gff_fn)
    rmpath(bad_gff_fn)
    rmpath(group_fn)
    records = _get_sam_groups()[0]["+"]  # contains 10 sam records
    with CollapseGffWriter(good_gff_fn) as good_gff_writer, \
         CollapseGffWriter(bad_gff_fn) as bad_gff_writer, \
         GroupWriter(group_fn) as group_writer:
        collapse_sam_records(records=records, cuff_index=0, cov_threshold=2,
                             allow_extra_5exon=False, skip_5_exon_alt=True,
                             good_gff_writer=good_gff_writer,
                             bad_gff_writer=bad_gff_writer,
                             group_writer=group_writer)

    def str_to_gffrecord(line):
        # Parse one tab-delimited GFF line into a Gff3Record; attributes come
        # from 'key value' pairs in the 9th column.
        fields = line.strip().split('\t')
        print fields
        attributes = []
        for attr_tuple in fields[8].split(';'):
            if len(attr_tuple.strip()) == 0:
                continue
            else:
                fs = attr_tuple.strip().split(' ')
                if len(fs) == 2:
                    attributes.append((fs[0], fs[1].replace('"', '')))
        return Gff3Record(seqid=fields[0], start=fields[3], end=fields[4],
                          type=fields[2], attributes=attributes)

    # NOTE(review): unlike the sibling variant of this test, '##' header
    # lines are NOT filtered out here -- presumably these gff outputs contain
    # no header lines; confirm, otherwise fields[8] would raise IndexError.
    bad_gff_records = [
        str_to_gffrecord(line) for line in open(bad_gff_fn, 'r')
    ]
    self.assertEqual(len(bad_gff_records), 0)
    good_gff_records = [
        str_to_gffrecord(line) for line in open(good_gff_fn, 'r')
    ]
    self.assertEqual(len(good_gff_records), 4)
    # One transcript plus its three exons.
    self.assertEqual(
        [(int(r.start), int(r.end), r.type,
          r.attributes['gene_id'], r.attributes['transcript_id'])
         for r in good_gff_records],
        [
            (10711, 11641, 'transcript', "PB.0", "PB.0.1"),
            (10711, 10791, 'exon', "PB.0", "PB.0.1"),
            (10883, 11057, 'exon', "PB.0", "PB.0.1"),
            (11435, 11641, 'exon', "PB.0", "PB.0.1"),
        ])