def test_runner(self):
    """Test CombineRunner."""
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
    d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
    split_dirs = [op.join(d, b, "cluster_out") for b in
                  ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0",
                   "3to4kb_part0", "4to5kb_part0")]
    print(split_dirs)
    out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
    rmpath(out_combined_dir)
    mkdir(out_combined_dir)
    obj = CombineRunner(combined_dir=out_combined_dir,
                        sample_name="mysample",
                        split_dirs=split_dirs,
                        ipq_opts=ipq_opts)
    obj.run()

    expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa,
                        obj.all_lq_fq, obj.all_consensus_isoforms_fa,
                        obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
    self.assertTrue(all([op.exists(f) for f in expected_out_fns]))

    expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826',
                            'i2_HQ_mysample|c2/f9p14/2470',
                            'i2_HQ_mysample|c5/f7p19/2472',
                            'i2_HQ_mysample|c10/f8p16/2457',
                            'i2_HQ_mysample|c98/f2p10/2081',
                            'i2_HQ_mysample|c108/f23p28/2471']
    self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)],
                     expected_hq_isoforms)
    self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)],
                     expected_hq_isoforms)

    expected_lq_isoforms_num = 73
    self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]),
                     expected_lq_isoforms_num)

    expected_consensus_isoforms_num = 79
    self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]),
                     expected_consensus_isoforms_num)
def __init__(self, input_fasta, reads_per_split, out_dir, out_prefix):
    self.input_fasta = input_fasta
    self.out_dir = out_dir
    self.reads_per_split = reads_per_split  # Number of reads per split
    self.out_prefix = out_prefix
    self.out_fns = None
    mkdir(self.out_dir)
def make_sane(args):
    """Sanity-check and normalize input/output arguments."""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)

    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)
    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" %
                      args.smrtlink_job_dir)
    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." %
                      (args.gmap_db, args.gmap_name))
    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create a .bax.h5.fasta file and save the paths to
    out_filename, which should usually be 'input.fasta.fofn'.

    Modified: 09/14/2015, both ends of subreads in fasta files will be trimmed
    in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
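# Illustrative usage sketch (not part of the original module): the fofn and
# output paths below are hypothetical placeholders for real bax.h5 movie data.
convert_fofn_to_fasta(fofn_filename="input.fofn",
                      out_filename="input.fasta.fofn",
                      fasta_out_dir="fasta_out",
                      force_overwrite=False)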
def make_cluster_out_dir(in_dir, root_dir):
    bin_name = op.basename(op.dirname(in_dir))
    # e.g., root_dir/0to1kb_part0/cluster_out
    new_dir = op.join(root_dir, bin_name, "cluster_out")
    mkdir(new_dir)
    return new_dir
def __init__(self, root_dir, fasta_filenames, fastq_filenames, ref_fasta,
             out_pickle, ice_opts, sge_opts, cpus, tmp_dir=None):
    """
    fasta_filenames --- a list of split nfl fasta files.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- a pickle file with all nfl fasta reads
    root_dir --- ICE root output directory
    tmp_dir --- if not None, write temporary clusters, dazz, las files
                to the given temporary directory
    sge_opts --- params for SGE environment, including
        use_sge : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id : unique qsub job id, important that this DOES NOT CONFLICT!
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir,
                      tmp_dir=tmp_dir)

    self.fasta_filenames, self.ref_fasta = \
        self._validate_inputs(fasta_filenames=fasta_filenames,
                              ref_fasta=ref_fasta)
    if fastq_filenames is not None:
        for fq in fastq_filenames:
            assert op.exists(fq)
    self.fastq_filenames = fastq_filenames  # note: could be None

    self.out_pickle = out_pickle

    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.cpus = cpus  # number of CPUs to use per SGE job or per local job

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fasta files are: " + ", ".join(self.fasta_filenames))
    self.add_log("temp pickle files are: " + ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
    self.add_log("temp directory is: " + str(self.tmp_dir))
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the cluster in the new tmp_dir
    e.g., self.g_consensus_ref_fa_of_cluster(cid)

    cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
    refs --- dict{int(cid): ref_fa of cluster(cid)}
    """
    # Check existence when it is first read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    self.add_log("Reconstructing g consensus files for clusters "
                 "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                 level=logging.INFO)

    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in final_consensus_d.d.keys():
        # e.g., ref_id = c103/1/3708, cid = 103,
        # refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
        cid = int(ref_id.split('/')[0].replace('c', ''))
        if cid in cids:
            mkdir(self.cluster_dir(cid))
            ref_fa = op.join(self.cluster_dir(cid), op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruction of g consensus files completed.",
                 level=logging.INFO)
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa, target_filename=testInFa,
                            is_FL=True, same_strand_only=True,
                            query_converted=False, target_converted=False,
                            use_sge=False, cpus=4, sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
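# Illustrative usage sketch (not part of the original module): the check only
# needs a writable script directory; "ice_script_dir" is a hypothetical path.
if sanity_check_daligner(scriptDir="ice_script_dir"):
    logging.info("daligner is usable on this host.")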
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)
    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    errMsg = ""
    if N <= 0 or N > 100:
        errMsg = "Input file cannot be split into %d chunks!" % N
    if not nfs_exists(nfl_fa):
        errMsg = ("The input non-full-length reads fasta file " +
                  "{f} does not exist. ".format(f=nfl_fa))
    if len(errMsg) != 0:
        raise ValueError(errMsg)

    num_reads = num_reads_in_fasta(nfl_fa)
    reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
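# Worked example of the chunk-size arithmetic above (illustrative only, not
# part of the original module): 1000 reads split into N=3 chunks.
from math import ceil
num_reads, N = 1000, 3
reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))
assert reads_per_split == 334  # the chunks then hold 334, 334 and 332 reads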
def __init__(self, input_fa_or_fq, reads_per_split, out_dir, out_format, is_fq):
    self.input_fa_or_fq = input_fa_or_fq
    self.is_fq = is_fq
    self.out_dir = out_dir
    self.reads_per_split = reads_per_split  # Number of reads per split
    self.out_format = out_format
    self.out_fns = None
    mkdir(self.out_dir)
def __enter__(self):
    # make a sub dir for each separation criteria
    for d in self.out_dirs:
        mkdir(d)
    # open all fasta file handlers
    for index, key in enumerate(self.sorted_keys):
        self.handles[key] = open(self.out_fasta_files[index], 'w')
    return self
def __init__(self, root_dir, fasta_filenames, ref_fasta, out_pickle,
             sge_opts, ccs_fofn=None, tmp_dir=None):
    """
    fasta_filenames --- a list of split nfl fasta files.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- a pickle file with all nfl fasta reads
    ccs_fofn --- should be reads_of_insert.fofn or None
    root_dir --- ICE root output directory
    tmp_dir --- if not None, write temporary clusters, dazz, las files
                to the given temporary directory
    sge_opts --- params for SGE environment, including
        use_sge : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id : unique qsub job id, important that this DOES NOT CONFLICT!
        blasr_nproc: blasr -nproc param, number of threads per cpu.
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir,
                      tmp_dir=tmp_dir)

    self.fasta_filenames, self.ref_fasta, self.ccs_fofn = \
        self._validate_inputs(fasta_filenames=fasta_filenames,
                              ref_fasta=ref_fasta, ccs_fofn=ccs_fofn)

    self.out_pickle = out_pickle

    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fasta files are: " + ", ".join(self.fasta_filenames))
    self.add_log("temp pickle files are: " + ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
    self.add_log("temp directory is: " + str(self.tmp_dir))
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir, create
    quivered_dir and quivered_log_dir"""
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""

    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
            f=self.bas_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    if self.bas_fofn is not None and \
       guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
        # No need to convert subreads.bam to fasta
        if self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exist.".\
                format(f=self.fasta_fofn)
        else:
            fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
            for fasta_file in fasta_files:
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                        format(f=fasta_file)

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def __init__(self, prog_name, root_dir, bas_fofn=None, ccs_fofn=None,
             fasta_fofn=None, no_log_f=False, tmp_dir=None, make_dirs=True):
    """
    prog_name --- name of a sub-class
    root_dir --- root directory of the whole project. There will be
                 sub-directories under it, including:
        tmp/
            --- 0/  c0, c1, ..., c9999
            --- 1/  c10000, c10001, ..., c19999
            ...
            each c? folder contains data for a cluster id=c?
        script/
            --- 0/  gcon_job_?.sh, gcon jobs in the first iteration
            --- 1/  gcon_job_?.sh, gcon jobs in the second iteration
            ...
        log/
            --- ICE.log  Log of the ICE algorithm
            --- 0/       log for jobs in the first iteration
            ...
        output/  output files go here.
    bas_fofn --- input.fofn which contains movie.bas|bax.h5 files.
    ccs_fofn --- a fofn containing movie.ccs.h5 files.
    fasta_fofn --- a fofn containing movie.bax.h5.fasta files.
    no_log_f --- DON'T write log to a log file.
    tmp_dir --- Write temporary files to tmp_dir (usually /scratch) for speed
    """
    self.prog_name = str(prog_name)
    self.root_dir = real_ppath(root_dir)
    self._tmp_dir = real_ppath(tmp_dir)

    self.bas_fofn = real_ppath(bas_fofn)
    self.ccs_fofn = real_ppath(ccs_fofn)
    self.fasta_fofn = real_ppath(fasta_fofn)

    if make_dirs is True:
        mkdir(self.root_dir)
        mkdir(self.tmp_dir)
        mkdir(self.log_dir)
        mkdir(self.script_dir)
        mkdir(self.out_dir)

    self.no_log_f = no_log_f
    if not no_log_f:
        self.log_f = open(self.log_fn, 'w', 0)

    self.add_log(msg="{p} initialized.".format(p=self.prog_name))
def __init__(self, root_dir, subread_set, nproc):
    tmp_dir = op.join(root_dir, "tmp")
    mkdir(tmp_dir)
    super(IceQuiverRTC, self).__init__(
        root_dir=root_dir,
        bas_fofn=subread_set,
        fasta_fofn=None,
        sge_opts=SgeOptions(unique_id=12345,
                            use_sge=False,
                            max_sge_jobs=0,
                            blasr_nproc=nproc,
                            quiver_nproc=nproc),
        prog_name="IceQuiver")
def _cp(task, new_task, copied_files,
        copy_consensus_isoforms=copy_consensus_isoforms,
        copy_flnc_pickle=copy_flnc_pickle,
        copy_nfl_pickle=copy_nfl_pickle):
    """Copy task.files to new_task.files."""
    if copy_consensus_isoforms is True and \
       new_task.consensus_isoforms_file not in copied_files:
        shutil.copy(task.consensus_isoforms_file,
                    new_task.consensus_isoforms_file)
        copied_files[new_task.consensus_isoforms_file] = True
    if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
        mkdir(op.dirname(new_task.flnc_pickle))
        shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
        copied_files[new_task.flnc_pickle] = True
    if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
        mkdir(op.dirname(new_task.nfl_pickle))
        shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
        copied_files[new_task.nfl_pickle] = True
def resolved_tool_contract_to_args(resolved_tool_contract):
    """Convert resolved tool contract to args."""
    rtc = resolved_tool_contract
    args = [
        "--verbose",
        "classify",
        resolved_tool_contract.task.input_files[0],
        resolved_tool_contract.task.output_files[0],
        "--flnc", resolved_tool_contract.task.output_files[1],
        "--nfl", resolved_tool_contract.task.output_files[2],
        "--summary", resolved_tool_contract.task.output_files[3],  # JSON
        "--report", resolved_tool_contract.task.output_files[4],   # CSV
        "--min_seq_len", str(rtc.task.options[Constants.MIN_SEQ_LEN_ID]),
        "--cpus", str(resolved_tool_contract.task.nproc),
        "--outDir", op.dirname(rtc.task.output_files[0]),
        "--ignore-empty-output",
    ]
    if rtc.task.options[Constants.IGNORE_POLYA_ID]:
        args.append("--ignore_polyA")

    primers_str_obj = rtc.task.options[Constants.PRIMER_SEQUENCES_ID]
    primers_str = str(primers_str_obj).strip().translate(None, '\'\" ')
    if primers_str_obj is not None and primers_str not in ('None', ''):
        logging.info("Detected custom primers: %s", primers_str)
        # Save primer sequences to a fasta file under the output dir
        primer_fasta_records = parse_primer_sequences(primers_str=primers_str)
        d = op.dirname(resolved_tool_contract.task.output_files[2])
        mkdir(d)
        primer_fn = op.join(d, "customer_primers.fasta")
        with FastaWriter(primer_fn) as writer:
            for record in primer_fasta_records:
                writer.writeRecord(record)
        logging.info("Custom primer sequences written to file %s", primer_fn)
        args.append("-p")
        args.append("%s" % primer_fn)
    else:
        logging.info("No custom primers detected.")
    return get_argument_parser().parse_args(args)
def setUp(self):
    """Initialize."""
    self.data_dir = op.join(DATA_DIR, "test_daligner_against_ref")
    self.script_dir = op.join(OUT_DIR, "test_ice_daligner_script")
    self.dazz_dir = op.join(OUT_DIR, "test_ice_daligner_dazz")
    self.out_dir = op.join(OUT_DIR, "test_ice_daligner_out")
    mkdir(self.dazz_dir)
    mkdir(self.out_dir)

    self.stdout_dir = STD_DIR
    self.sivDataDir = SIV_DATA_DIR
    self.query_filename = "test_daligner_query.fasta"
    self.target_filename = "test_daligner_target.fasta"

    self.runner = DalignerRunner(
        query_filename=op.join(self.data_dir, self.query_filename),
        target_filename=op.join(self.data_dir, self.target_filename),
        is_FL=False, same_strand_only=True,
        dazz_dir=self.dazz_dir, script_dir=self.script_dir)
    self.runner.output_dir = self.out_dir
def run(self):
    """Call DalignerRunner."""
    logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                            v=self.getVersion()))
    args = self.args
    mkdir(args.output_dir)

    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)

    obj = DalignerRunner(query_filename=args.query_fasta,
                         target_filename=args.target_fasta,
                         is_FL=args.is_FL,
                         same_strand_only=args.same_strand_only,
                         query_converted=False, target_converted=False,
                         use_sge=args.use_sge, sge_opts=sge_opts)
    obj.run(output_dir=args.output_dir)
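# Illustrative local (non-SGE) invocation of DalignerRunner mirroring the args
# handled above (not part of the original module); file names are placeholders.
sge_opts = SgeOptions(unique_id=100, use_sge=False, max_sge_jobs=0, blasr_nproc=4)
runner = DalignerRunner(query_filename="query.fasta",
                        target_filename="target.fasta",
                        is_FL=False, same_strand_only=True,
                        query_converted=False, target_converted=False,
                        use_sge=False, cpus=4, sge_opts=sge_opts)
runner.run(output_dir="daligner_out")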
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir, create
    arrowed_dir and arrowed_log_dir"""
    self.add_log("Validating inputs.")

    # Create directories: root_dir/arrowed and root_dir/log_dir/arrowed
    try:
        mkdir(self.arrowed_dir)
        mkdir(self.arrowed_log_dir)
    except OSError:
        # Multiple ice_arrow_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""

    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.subread_xml is None:
        errMsg = "Please specify subreads XML (e.g., --subread_xml=<movie>.subreadset.xml)."
    elif not nfs_exists(self.subread_xml):
        errMsg = "Specified subreads file (subread_xml={f}) does not exist.".format(
            f=self.subread_xml)
    elif guess_file_format(self.subread_xml) is not FILE_FORMATS.BAM:
        errMsg = "Invalid subreads XML file: {0}!".format(self.subread_xml)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'run_IcePartials2.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def __init__(self, flnc_filename, root_dir, out_pickle, output_basename):
    """
    Reads in the input flnc file will be separated into multiple categories
    according to the separation criterion, and reads in each category will
    be written to
        <root_dir>/<separation_criteria>/<output_basename>.fasta|contigset.xml
    e.g., if reads are separated by primers, then reads will be written to
        <root_dir>/<primer*>/<output_basename>.fasta|contigset.xml

    Parameters:
        flnc_filename - input full-length non-chimeric reads in FASTA or CONTIGSET
        root_dir - output root directory
        output_basename - output file basename
    """
    self.flnc_filename = flnc_filename
    self.root_dir = realpath(root_dir)
    mkdir(root_dir)
    self.output_basename = output_basename
    self.create_contigset = True if flnc_filename.endswith(".xml") else False
    self.handles = {}  # key --> fasta file handler
    self.out_pickle = out_pickle if out_pickle is not None \
        else op.join(self.root_dir, "separate_flnc.pickle")
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the cluster in the new tmp_dir
    e.g., self.g_consensus_ref_fa_of_cluster(cid)

    Liz: new cids after ice2 collection are b<bin>_c<cid>
    refs --- dict{int(cid): ref_fa of cluster(cid)}
    """
    # Check existence when it is first read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    print("Reconstructing g consensus files for clusters {0}, {1} in {2}".
          format(cids[0], cids[-1], self.tmp_dir))
    self.add_log(
        "Reconstructing g consensus files for clusters {0}, {1} in {2}".
        format(cids[0], cids[-1], self.tmp_dir))

    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in list(final_consensus_d.d.keys()):
        # Liz: this is no longer valid for the Ice2 cids
        # cid = int(ref_id.split('/')[0].replace('c', ''))
        cid = ref_id
        if cid in cids:
            _dir = self.cluster_dir_for_reconstructed_ref(cid)
            mkdir(_dir)
            ref_fa = op.join(_dir, op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruction of g consensus files completed.",
                 level=logging.INFO)
def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
    """
    If self.use_sge --- writes to <scripts>/daligner_job_#.sh
    else --- run locally, dividing into self.cpus/4 tasks (capped max at 4)

    NOTE 1: when using SGE, be careful that multiple calls to this might
    end up writing to the SAME job.sh files; this should be avoided by
    changing the <scripts> directory

    NOTE 2: more commonly this should be invoked locally (since
    ice_partial.py i/one would be qsub-ed), in which case it is recommended
    to keep self.cpus = 4 so that each daligner job is run consecutively,
    and the original qsub job should have been called with
    qsub -pe smp 4 (set by --blasr_nproc 4)
    In this way, the daligner jobs are called consecutively, but LA4Ice is
    parallelized 4X
    """
    self.output_dir = realpath(output_dir)  # Reset output_dir
    old_dir = realpath(op.curdir)
    mkdir(output_dir)
    os.chdir(output_dir)

    if self.use_sge:
        mknewdir(self.script_dir)

    # preparing done scripts is no longer necessary.
    # self.write_daligner_done_script()
    # self.write_la4ice_done_script()

    # (a) run all daligner jobs
    daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                       sensitive_mode=sensitive_mode)

    logging.info("Start daligner cmds " +
                 ("using sge." if self.use_sge else "locally."))
    logging.debug("CMD: " + "\n".join(daligner_cmds))

    start_t = time.time()
    failed = []
    if self.use_sge:
        failed.extend(
            sge_job_runner(cmds_list=daligner_cmds,
                           script_files=self.daligner_scripts,
                           #done_script=self.daligner_done_script,
                           num_threads_per_job=DALIGNER_NUM_THREADS,
                           sge_opts=self.sge_opts, qsub_try_times=3,
                           wait_timeout=600, run_timeout=600,
                           rescue="sge", rescue_times=3))
    else:
        # max 4 at a time to avoid running out of memory...
        failed.extend(
            local_job_runner(cmds_list=daligner_cmds,
                             num_threads=max(1, min(self.cpus / 4, 4))))
    logging.info("daligner jobs took " + str(time.time() - start_t) + " sec.")

    # (b) run all LA4Ice jobs
    start_t = time.time()
    logging.info("Start LA4Ice cmds " +
                 ("using sge." if self.use_sge else "locally."))
    la4ice_cmds = self.la4ice_cmds
    logging.debug("CMD: " + "\n".join(la4ice_cmds))
    if self.use_sge:
        failed.extend(
            sge_job_runner(cmds_list=la4ice_cmds,
                           script_files=self.la4ice_scripts,
                           #done_script=self.la4ice_done_script,
                           num_threads_per_job=DALIGNER_NUM_THREADS,
                           sge_opts=self.sge_opts, qsub_try_times=3,
                           wait_timeout=600, run_timeout=600,
                           rescue="sge", rescue_times=3))
    else:
        # max 4 at a time to avoid running out of memory...
        failed.extend(
            local_job_runner(cmds_list=la4ice_cmds,
                             num_threads=max(1, min(self.cpus, 4))))
    logging.info("LA4Ice jobs took " + str(time.time() - start_t) + " sec.")

    os.chdir(old_dir)

    if len(failed) == 0:
        return 0
    else:
        raise RuntimeError("%s.run failed, %s." %
                           (self.__class__.__name__,
                            "\n".join([x[0] for x in failed])))
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                    qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn, ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn, out_fa=cur_out_cons,
                      sge_opts=sge_opts, ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform clusters from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
        out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage,
        min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage,
        max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon,
        min_count=args.min_count)

    return 0
def __init__(self, combined_dir):
    self.combined_dir = realpath(combined_dir)
    mkdir(self.combined_dir)
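# Hedged usage sketch grounded in the test_runner snippet earlier (not part of
# the original module): CombinedFiles only prepares the combined output
# directory, while CombineRunner consumes per-bin cluster_out directories.
# The directory names below are hypothetical placeholders.
ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
runner = CombineRunner(combined_dir="combined",
                       sample_name="mysample",
                       split_dirs=["0to1kb_part0/cluster_out",
                                   "1to2kb_part0/cluster_out"],
                       ipq_opts=ipq_opts)
runner.run()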
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns, split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # 'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  # 'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  # 'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  # 'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa, lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
def make_pickle(in_pickle, out_pickle, root_dir,
                copy_consensus_isoforms=False,
                copy_flnc_pickle=False,
                copy_nfl_pickle=False,
                copy_quivered=False):
    """
    Copy cluster_out_dir in in_pickle to {root_dir}/bin_name/cluster_out/
    """
    mkdir(root_dir)

    def make_flnc(in_flnc, root_dir):
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)
        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")
        print("new_flnc = %s" % new_flnc)
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)

    def make_cluster_out_dir(in_dir, root_dir):
        bin_name = op.basename(op.dirname(in_dir))
        # e.g., root_dir/0to1kb_part0/cluster_out
        new_dir = op.join(root_dir, bin_name, "cluster_out")
        mkdir(new_dir)
        return new_dir

    def _cp(task, new_task, copied_files,
            copy_consensus_isoforms=copy_consensus_isoforms,
            copy_flnc_pickle=copy_flnc_pickle,
            copy_nfl_pickle=copy_nfl_pickle):
        """Copy task.files to new_task.files."""
        if copy_consensus_isoforms is True and \
           new_task.consensus_isoforms_file not in copied_files:
            shutil.copy(task.consensus_isoforms_file,
                        new_task.consensus_isoforms_file)
            copied_files[new_task.consensus_isoforms_file] = True
        if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
            mkdir(op.dirname(new_task.flnc_pickle))
            shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
            copied_files[new_task.flnc_pickle] = True
        if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
            mkdir(op.dirname(new_task.nfl_pickle))
            shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
            copied_files[new_task.nfl_pickle] = True

    print("making pickle from in_pickle %s to out_pickle %s, root_dir %s" %
          (in_pickle, out_pickle, root_dir))
    p = ChunkTasksPickle.read(in_pickle)
    assert len(p) > 0

    if all([isinstance(task, ClusterChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print("new_cluster_out_dir is %s" % cluster_out_dir)
            #flnc_file = make_flnc(task.flnc_file)
            new_task = ClusterChunkTask(task.cluster_bin_index,
                                        task.flnc_file, cluster_out_dir)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PartialChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print("new_cluster_out_dir is %s" % cluster_out_dir)
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PartialChunkTask(task.cluster_bin_index,
                                        task.flnc_file, cluster_out_dir,
                                        nfl_file=task.nfl_file,
                                        nfl_index=task.nfl_index,
                                        n_nfl_chunks=task.n_nfl_chunks)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PolishChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print("new_cluster_out_dir is %s" % cluster_out_dir)
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PolishChunkTask(task.cluster_bin_index,
                                       task.flnc_file, cluster_out_dir,
                                       polish_index=task.polish_index,
                                       n_polish_chunks=task.n_polish_chunks)
            mkdir(op.dirname(new_task.nfl_pickle))
            # always copy nfl_pickle for PolishChunkTask
            assert copy_nfl_pickle is True
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            dst_dir = op.join(cluster_out_dir, "quivered")
            if copy_quivered is True and dst_dir not in copied_files:
                if op.exists(dst_dir):
                    shutil.rmtree(dst_dir)
                shutil.copytree(op.join(task.cluster_out_dir, "quivered"), dst_dir)
                copied_files[dst_dir] = True
            outp.append(new_task)
        outp.write(out_pickle)
    else:
        assert False
"""Test pbtranscript.collapsing.Branch.""" import unittest import os.path as op import cPickle import filecmp import numpy as np from pbtranscript.Utils import rmpath, mkdir from pbtranscript.tasks.map_isoforms_to_genome import gmap_db_and_name_from_ds from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR READS_DS = op.join(SIV_DATA_DIR, 'test_collapsing', 'gmap-input.fastq.contigset.xml') GMAP_DS = op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir/SIRV/gmapreferenceset.xml") _OUT_DIR_ = op.join(OUT_DIR, "test_map_isoforms_to_genome") rmpath(_OUT_DIR_) mkdir(_OUT_DIR_) class TEST_map_isoforms_to_genome(unittest.TestCase): """Test functions of pbtranscript.tasks.map_isoforms_to_genome.""" def setUp(self): """Define input and output file.""" def test_gmap_db_and_name_from_ds(self): """Test map_isoforms_to_genome.gmap_db_and_name_from_ds""" gmap_db, gmap_name = gmap_db_and_name_from_ds(GMAP_DS) self.assertEqual(gmap_db, op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir", "SIRV")) self.assertEqual(gmap_name, "gmap_db")
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # Sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir, ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)

    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms, fasta
    ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms, fastq
    ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms, fasta
    ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms, fastq
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)  # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # cluster report
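# --- Hedged illustration of the combined_dir layout used above ---------------
# combined_dir is derived from the first cluster bin's output directory by going
# two levels up and appending "combined", so it ends up as a sibling of the
# per-bin directories. Paths below are hypothetical.
import os.path as op

cluster_out_dir = "/jobs/ice/0to1kb_part0/cluster_out"                     # hypothetical
combined_dir = op.join(op.dirname(op.dirname(cluster_out_dir)), "combined")
print combined_dir  # -> "/jobs/ice/combined", next to 0to1kb_part0, 1to2kb_part0, ...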
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
        self.gmap_db_dir = op.join(_OUT_DIR_, 'gmap db dir')
        os.symlink(GMAP_DB, self.gmap_db_dir)
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
    def run(self):
        """
        For each cluster bin, create summary.json, cluster_report.csv,
        hq_isoforms.fa|fq, lq_isoforms.fa|fq.
        Finally, merge all cluster bins and save all outputs to 'combined'.
        """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=self.getVersion()))
        args = self.args

        # Get cluster bin directories as input.
        cluster_bin_dirs = self.get_cluster_bin_dirs(
            separate_flnc_pickle=args.separate_flnc_pickle,
            cluster_bin_dirs=args.cluster_bin_dirs)
        cluster_bin_indices = range(0, len(cluster_bin_dirs))

        # Create output dir.
        combined_dir = args.combined_dir
        mkdir(combined_dir)

        def f(input_fn, default_fn):
            """Return input_fn if set, otherwise the default file under combined_dir."""
            return input_fn if input_fn is not None else op.join(combined_dir, default_fn)

        # Get combined output filenames.
        out_consensus_isoforms_fa = f(args.consensus_isoforms_fa, "all.consensus_isoforms.fasta")
        out_summary = f(args.summary_fn, "all.cluster_summary.json")
        out_report = f(args.report_fn, "all.cluster_report.csv")
        out_hq_fa = f(args.hq_isoforms_fa, "all.polished_hq.fasta")
        out_lq_fa = f(args.lq_isoforms_fa, "all.polished_lq.fasta")
        out_hq_fq = f(args.hq_isoforms_fq, "all.polished_hq.fastq")
        out_lq_fq = f(args.lq_isoforms_fq, "all.polished_lq.fastq")

        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                        qv_trim_3=args.qv_trim_3,
                                        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
        sample_name = get_sample_name(input_sample_name=args.sample_name)

        hq_fq_fns, lq_fq_fns = [], []
        split_uc_pickles, split_partial_uc_pickles = [], []
        split_consensus_isoforms = []
        for cluster_bin_dir in cluster_bin_dirs:
            ice_pq = IceQuiverPostprocess(root_dir=cluster_bin_dir, ipq_opts=ipq_opts)
            hq_fq_fns.append(ice_pq.quivered_good_fq)
            lq_fq_fns.append(ice_pq.quivered_bad_fq)
            split_uc_pickles.append(ice_pq.final_pickle_fn)
            split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
            split_consensus_isoforms.append(ice_pq.final_consensus_fa)

        combined_files = CombinedFiles(combined_dir)
        log.info("Combining results of all cluster bins to %s.", combined_dir)
        log.info("Merging HQ|LQ isoforms from all cluster bins.")
        log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
        log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
        combine_polished_isoforms(split_indices=cluster_bin_indices,
                                  split_hq_fns=hq_fq_fns,
                                  split_lq_fns=lq_fq_fns,
                                  combined_hq_fa=combined_files.all_hq_fa,
                                  combined_hq_fq=combined_files.all_hq_fq,
                                  combined_lq_fa=combined_files.all_lq_fa,
                                  combined_lq_fq=combined_files.all_lq_fq,
                                  hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                                  sample_name=sample_name)

        ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms, fasta
        ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms, fastq
        ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms, fasta
        ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms, fastq

        log.info("Merging consensus isoforms from all cluster bins.")
        combine_consensus_isoforms(split_indices=cluster_bin_indices,
                                   split_files=split_consensus_isoforms,
                                   combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                                   sample_name=sample_name)
        ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)

        log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
        write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                              isoforms_fa=out_consensus_isoforms_fa,
                              hq_fa=out_hq_fa,
                              lq_fa=out_lq_fa)
        ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

        log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
        write_combined_cluster_report(split_indices=cluster_bin_indices,
                                      split_uc_pickles=split_uc_pickles,
                                      split_partial_uc_pickles=split_partial_uc_pickles,
                                      report_fn=combined_files.all_cluster_report_fn,
                                      sample_name=sample_name)
        ln(combined_files.all_cluster_report_fn, out_report)  # cluster report
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
    def __init__(self, combined_dir):
        self.combined_dir = realpath(combined_dir)
        mkdir(self.combined_dir)
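# --- Hedged usage sketch ------------------------------------------------------
# CombinedFiles(combined_dir) resolves and creates the combined output directory
# and exposes the combined output paths referenced above (all_hq_fa, all_hq_fq,
# all_lq_fa, all_lq_fq, all_consensus_isoforms_fa, all_cluster_report_fn,
# all_cluster_summary_fn, hq_lq_prefix_dict_pickle). The exact basenames are not
# defined in this snippet, so only the attribute names are assumed here; the
# directory path is hypothetical.
combined_files = CombinedFiles("/tmp/example_combined")  # hypothetical path
print combined_files.all_hq_fa              # combined HQ isoform fasta
print combined_files.all_cluster_report_fn  # combined cluster report csv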
    def cluster_dir(self, cid):
        """Override IceQuiver.cluster_dir, creating the directory on demand."""
        dir_name = IceQuiver.cluster_dir(self, cid)
        mkdir(dir_name)
        return dir_name
def make_pickle(in_pickle, out_pickle, root_dir,
                copy_consensus_isoforms=False,
                copy_flnc_pickle=False,
                copy_nfl_pickle=False,
                copy_quivered=False):
    """
    Copy cluster_out_dir in in_pickle to {root_dir}/bin_name/cluster_out/.
    """
    mkdir(root_dir)

    def make_flnc(in_flnc, root_dir):
        """Copy a bin's flnc fasta into root_dir/bin_name/ and rebuild its contigset."""
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)
        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")
        print "new_flnc = %s" % new_flnc
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)

    def make_cluster_out_dir(in_dir, root_dir):
        """Make and return root_dir/bin_name/cluster_out, e.g., root_dir/0to1kb_part0/cluster_out."""
        bin_name = op.basename(op.dirname(in_dir))
        new_dir = op.join(root_dir, bin_name, "cluster_out")
        mkdir(new_dir)
        return new_dir

    def _cp(task, new_task, copied_files,
            copy_consensus_isoforms=copy_consensus_isoforms,
            copy_flnc_pickle=copy_flnc_pickle,
            copy_nfl_pickle=copy_nfl_pickle):
        """Copy task.files to new_task.files, skipping files already copied."""
        if copy_consensus_isoforms is True and new_task.consensus_isoforms_file not in copied_files:
            shutil.copy(task.consensus_isoforms_file, new_task.consensus_isoforms_file)
            copied_files[new_task.consensus_isoforms_file] = True
        if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
            mkdir(op.dirname(new_task.flnc_pickle))
            shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
            copied_files[new_task.flnc_pickle] = True
        if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
            mkdir(op.dirname(new_task.nfl_pickle))
            shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
            copied_files[new_task.nfl_pickle] = True

    print "making pickle from in_pickle %s to out_pickle %s, root_dir %s" % \
        (in_pickle, out_pickle, root_dir)
    p = ChunkTasksPickle.read(in_pickle)
    assert len(p) > 0

    if all([isinstance(task, ClusterChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = ClusterChunkTask(task.cluster_bin_index, task.flnc_file, cluster_out_dir)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PartialChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PartialChunkTask(task.cluster_bin_index, task.flnc_file, cluster_out_dir,
                                        nfl_file=task.nfl_file,
                                        nfl_index=task.nfl_index,
                                        n_nfl_chunks=task.n_nfl_chunks)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PolishChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PolishChunkTask(task.cluster_bin_index, task.flnc_file, cluster_out_dir,
                                       polish_index=task.polish_index,
                                       n_polish_chunks=task.n_polish_chunks)
            mkdir(op.dirname(new_task.nfl_pickle))
            # Always copy nfl_pickle for PolishChunkTask.
            assert copy_nfl_pickle is True
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            dst_dir = op.join(cluster_out_dir, "quivered")
            if copy_quivered is True and dst_dir not in copied_files:
                if op.exists(dst_dir):
                    shutil.rmtree(dst_dir)
                shutil.copytree(op.join(task.cluster_out_dir, "quivered"), dst_dir)
                copied_files[dst_dir] = True
            outp.append(new_task)
        outp.write(out_pickle)
    else:
        raise AssertionError("Tasks in %s must all be of the same chunk task type." % in_pickle)
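# --- Hedged usage sketch (paths and flags are hypothetical) -------------------
# make_pickle() rewrites a ChunkTasksPickle so that every task's cluster_out_dir
# points under root_dir/<bin_name>/cluster_out/, optionally copying the files the
# downstream step needs. For a PolishChunkTask pickle, copy_nfl_pickle must be True.
make_pickle(in_pickle="/path/to/polish_chunks.pickle",       # hypothetical input pickle
            out_pickle="/path/to/out/polish_chunks.pickle",  # hypothetical output pickle
            root_dir="/path/to/out",                         # new root for cluster_out dirs
            copy_consensus_isoforms=True,
            copy_flnc_pickle=True,
            copy_nfl_pickle=True,
            copy_quivered=True)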
import unittest
import os.path as op

from pbcore.io import FastqReader
from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader
from pbtranscript.Utils import rmpath, mkdir
from pbtranscript.filtering.FilteringUtils import good_isoform_ids_by_count, \
    good_isoform_ids_by_removing_subsets, filter_by_count, filter_out_subsets
from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR

GROUP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.group.txt")
ABUNDANCE_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.abundance.txt")
GFF_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.gff")
REP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.rep.fastq")

_OUT_DIR_ = op.join(OUT_DIR, "test_filtering")
rmpath(_OUT_DIR_)
mkdir(_OUT_DIR_)


class TEST_FilteringUtils(unittest.TestCase):
    """Test functions of pbtranscript.filtering.FilteringUtils."""

    def setUp(self):
        """Define input and output file."""
        self.expected_good = ['PB.2.5', 'PB.5.1', 'PB.7.1',
                              'PB.10.2', 'PB.10.42', 'PB.12.1']
        self.expected_diff = ['PB.10.42', 'PB.10.36', 'PB.10.35']

    def test_good_isoform_ids_by_count(self):
        """Test good_isoform_ids_by_count."""
        good = good_isoform_ids_by_count(in_group_filename=GROUP_FN,
                                         in_abundance_filename=ABUNDANCE_FN,
                                         min_count=20)
        self.assertEqual(good, self.expected_good)
    def cluster_dir(self, cid):
        """Override IceQuiver.cluster_dir, creating the directory on demand."""
        dir_name = IceQuiver.cluster_dir(self, cid)
        mkdir(dir_name)
        return dir_name