def __init__(self, input_fasta, reads_per_split, out_dir, out_prefix):
    self.input_fasta = input_fasta
    self.out_dir = out_dir
    self.reads_per_split = reads_per_split  # Number of reads per split
    self.out_prefix = out_prefix
    self.out_fns = None
    mkdir(self.out_dir)
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)

    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    errMsg = ""
    if not nfs_exists(nfl_fa):
        errMsg = "The input non-full-length reads fasta file " + \
                 "{f} does not exist.".format(f=nfl_fa)
    if len(errMsg) != 0:
        raise ValueError(errMsg)

    num_reads = num_reads_in_fasta(nfl_fa)
    # Use float division so ceil rounds up and the N chunks cover all reads.
    reads_per_split = int(max(1, ceil(num_reads / float(N))))
    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
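# A minimal sketch (not part of the original module) of the chunk-size
# arithmetic used above; reads_per_chunk and the numbers are illustrative only.
from math import ceil

def reads_per_chunk(num_reads, n_chunks):
    """Round up so n_chunks chunks always cover num_reads reads."""
    return int(max(1, ceil(num_reads / float(n_chunks))))

assert reads_per_chunk(10001, 4) == 2501  # chunks hold 2501, 2501, 2501, 2498 reads
assert reads_per_chunk(3, 8) == 1         # never 0, even when n_chunks > num_reads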
def run(self): """Run quiver for ICE.""" # Create directories: root_dir/quivered and root_dir/log_dir/quivered mkdir(self.quivered_dir) mkdir(self.quivered_log_dir) files = get_files_from_fofn(self.fasta_fofn) msg = "Indexing {0} fasta files, please wait.".format(len(files)) self.add_log(msg) d = MetaSubreadFastaReader(files) self.add_log("Fasta files indexing done.") self.add_log("Loading uc from {f}.".format(f=self.final_pickle_fn)) a = load(open(self.final_pickle_fn)) uc = a['uc'] refs = a['refs'] self.add_log("Loading partial uc from {f}.". format(f=self.nfl_all_pickle_fn)) partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc'] partial_uc2 = defaultdict(lambda: []) partial_uc2.update(partial_uc) # Write report to quivered/cluster_report.FL_nonFL.csv self.add_log("Writing a csv report of cluster -> FL/NonFL reads to {f}". format(f=self.report_fn), level=logging.INFO) self.write_report(uc=uc, partial_uc=partial_uc2, report_fn=self.report_fn) good = [x for x in uc] #[x for x in uc if len(uc[x]) > 1 or len(partial_uc2[x]) >= 10] keys = sorted(good) # sort good keys (cluster ids) start = 0 end = len(keys) submitted = [] # submitted jobs todo = [] # to-do jobs self.submit_quiver_jobs(d=d, uc=uc, partial_uc=partial_uc2, refs=refs, keys=keys, start=start, end=end, submitted=submitted, todo=todo, use_sge=self.sge_opts.use_sge, max_sge_jobs=self.sge_opts.max_sge_jobs, quiver_nproc=self.sge_opts.quiver_nproc) with open(self.submitted_quiver_jobs_log, 'w') as f: f.write("\n".join(str(x[0]) + '\t' + str(x[1]) for x in submitted)) self.close_log() return 0
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir; create
    quivered_dir and quivered_log_dir."""
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""
    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".format(
            l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify bas_fofn (e.g. input.fofn)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                 "which contains bas/bax.h5 files does not exist."
    elif self.fasta_fofn is None:
        errMsg = "Please make sure ice_make_fasta_fofn has " + \
                 "been called, and specify fasta_fofn."
    elif not nfs_exists(self.fasta_fofn):
        errMsg = "Input fasta_fofn {f} does not exist.".format(
            f=self.fasta_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    if errMsg == "":
        # fasta_fofn exists at this point, so also verify that every fasta
        # file listed in it exists.
        fasta_files = get_files_from_fofn(self.fasta_fofn)
        for fasta_file in fasta_files:
            if not nfs_exists(fasta_file):
                errMsg = "A file {f} in fasta_fofn does not exist.".format(
                    f=fasta_file)

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def __init__(self, root_dir, fasta_filenames, ref_fasta,
             out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
    """
    fasta_filenames --- a list of split nfl fasta files.
    ref_fasta       --- (unpolished) consensus isoforms
    out_pickle      --- a pickle file with all nfl fasta reads
    ccs_fofn        --- should be reads_of_insert.fofn or None
    root_dir        --- ICE root output directory
    sge_opts        --- params for SGE environment, including
        use_sge     : use SGE or not
        max_sge_jobs: maximum number of gcon jobs submitted
        unique_id   : unique qsub job id, important that this
                      DOES NOT CONFLICT!
        blasr_nproc : blasr -nproc param, number of threads per cpu.
        gcon_nproc  : number of gcon that can run at the same time
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

    self.fasta_filenames, self.ref_fasta, self.ccs_fofn, self.sa_file = \
        self._validateInputs(fasta_filenames=fasta_filenames,
                             ref_fasta=ref_fasta,
                             ccs_fofn=ccs_fofn,
                             sa_file=sa_file)

    self.out_pickle = out_pickle
    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fasta files are: " +
                 ", ".join(self.fasta_filenames))
    self.add_log("temp pickle files are: " +
                 ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
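# A minimal sketch (not part of the original code) of an object satisfying the
# sge_opts parameter described in the docstring above. The real pipeline
# supplies its own SGE options class; the name SgeOptions and the defaults
# below are assumptions for illustration only.
class SgeOptions(object):
    """Bag of SGE-related settings consumed by the Ice* classes."""
    def __init__(self, unique_id, use_sge=False, max_sge_jobs=20,
                 blasr_nproc=12, gcon_nproc=4, quiver_nproc=8):
        self.unique_id = unique_id        # must not collide with other runs
        self.use_sge = use_sge            # submit via qsub instead of running locally
        self.max_sge_jobs = max_sge_jobs  # cap on concurrently submitted jobs
        self.blasr_nproc = blasr_nproc    # threads passed to blasr -nproc
        self.gcon_nproc = gcon_nproc      # concurrent gcon processes
        self.quiver_nproc = quiver_nproc  # threads per quiver job

# e.g. (hypothetical paths):
# IceAllPartials(root_dir="clusterOut", fasta_filenames=nfl_chunks,
#                ref_fasta="final.consensus.fa", out_pickle="nfl.pickle",
#                sge_opts=SgeOptions(unique_id=100, use_sge=True))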
def __init__(self, prog_name, root_dir, bas_fofn=None, ccs_fofn=None,
             fasta_fofn=None, no_log_f=False):
    """
    prog_name --- name of a sub-class
    root_dir  --- root directory of the whole project. There will be
                  sub-directories under it, including:
        tmp/
            --- 0/ c0, c1, ..., c9999
            --- 1/ c10000, c10001, ..., c19999
            ...
            each c? folder contains data for a cluster id=c?
        script/
            --- 0/ gcon_job_?.sh, gcon jobs in the first iteration
            --- 1/ gcon_job_?.sh, gcon jobs in the second iteration
            ...
        log/
            --- ICE.log   Log of the ICE algorithm
            --- 0/        log for jobs in the first iteration
            ...
        output/ output files go here.
    bas_fofn   --- input.fofn which contains movie.bas|bax.h5 files.
    ccs_fofn   --- a fofn containing movie.ccs.h5 files.
    fasta_fofn --- a fofn containing movie.bax.h5.fasta files.
    no_log_f   --- DON'T write log to a log file.
    """
    self.prog_name = str(prog_name)
    self.root_dir = real_ppath(root_dir)
    self.bas_fofn = real_ppath(bas_fofn)
    self.ccs_fofn = real_ppath(ccs_fofn)
    self.fasta_fofn = real_ppath(fasta_fofn)

    mkdir(self.root_dir)
    mkdir(self.tmp_dir)
    mkdir(self.log_dir)
    mkdir(self.script_dir)
    mkdir(self.out_dir)

    self.no_log_f = no_log_f
    if not no_log_f:
        # Open the log unbuffered (Python 2) so messages appear immediately.
        self.log_f = open(self.log_fn, 'w', 0)
        self.add_log(msg="Initializing {p}.".format(p=self.prog_name))
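# A plausible sketch (assumed, not copied from the original class) of the
# directory properties the mkdir() calls above rely on, following the layout
# documented in the docstring; it assumes `import os.path as op` at module
# level, and the real IceFiles may name or derive these differently.
@property
def tmp_dir(self):
    """root_dir/tmp, per-cluster working data."""
    return op.join(self.root_dir, "tmp")

@property
def log_dir(self):
    """root_dir/log, ICE.log plus per-iteration job logs."""
    return op.join(self.root_dir, "log")

@property
def script_dir(self):
    """root_dir/script, generated gcon job scripts."""
    return op.join(self.root_dir, "script")

@property
def out_dir(self):
    """root_dir/output, final output files."""
    return op.join(self.root_dir, "output")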
def __init__(self, root_dir, fastq_filenames, ref_fasta,
             out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
    """
    fastq_filenames --- a list of split nfl fastq files.
    ref_fasta       --- (unpolished) consensus isoforms
    out_pickle      --- a pickle file with all nfl reads
    ccs_fofn        --- should be reads_of_insert.fofn or None
    root_dir        --- ICE root output directory
    sge_opts        --- params for SGE environment, including
        use_sge     : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id   : unique qsub job id, important that this
                      DOES NOT CONFLICT!
        blasr_nproc : blasr -nproc param, number of threads per cpu.
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

    self.add_log("DEBUG: in IceAllPartials, ccs_fofn is {0}.".format(ccs_fofn),
                 level=logging.INFO)

    self.fastq_filenames, self.ref_fasta, self.ccs_fofn, self.sa_file = \
        self._validate_inputs(fastq_filenames=fastq_filenames,
                              ref_fasta=ref_fasta,
                              ccs_fofn=ccs_fofn,
                              sa_file=sa_file)

    self.out_pickle = out_pickle
    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fastq files are: " +
                 ", ".join(self.fastq_filenames))
    self.add_log("temp pickle files are: " +
                 ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. Skipping.".format(out_file))
        else:
            cmd = "pls2fasta {0} {1} ".format(in_fn, tmp_out_file) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("CMD: {cmd}".format(cmd=cmd))
            _out, _code, _msg = backticks(cmd)
            if _code != 0:
                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
            trim_subread_flanks(tmp_out_file, out_file)
        out_fns.append(out_file)
        if op.exists(tmp_out_file):
            os.remove(tmp_out_file)
    write_files_to_fofn(out_fns, out_filename)
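# Example invocation of the serial converter above (the paths are made up for
# illustration): it reads bax/bas.h5 paths from input.fofn, writes one .fasta
# per movie into fasta_out_dir, and records the new paths in input.fasta.fofn.
# convert_fofn_to_fasta(fofn_filename="input.fofn",
#                       out_filename="input.fasta.fofn",
#                       fasta_out_dir="clusterOut/fasta_files",
#                       force_overwrite=False)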
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    pool = []
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. Skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)

    cpus = min(cpus, in_queue_count)  # cap CPUs if there are fewer files to convert
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
        pool.append(p)

    # starting & joining pool workers
    for p in pool:
        p.start()
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()

    # Check that every expected output file exists; if one does not,
    # re-queue its command and run the conversion locally in this process.
    for out_file, (cmd, tmp_out_file) in outfile_track.iteritems():
        if not op.exists(out_file):
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
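# convert_fofn_to_fasta_worker is referenced above but not shown in this
# section. A minimal sketch of what such a worker could look like, assuming
# the same backticks() and trim_subread_flanks() helpers used by the serial
# version; the optional out_fns argument covers the two-argument call used by
# the manager.list() variant below. The real implementation may differ.
from Queue import Empty  # Python 2; use queue.Empty on Python 3

def convert_fofn_to_fasta_worker(in_queue, out_fns=None):
    """Drain (cmd, tmp_out_file, out_file) tuples from in_queue: run
    pls2fasta, trim subread flanks, then remove the tmp file."""
    while True:
        try:
            cmd, tmp_out_file, out_file = in_queue.get_nowait()
        except Empty:
            break
        _out, _code, _msg = backticks(cmd)
        if _code != 0:
            raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + str(_msg))
        trim_subread_flanks(tmp_out_file, out_file)
        if op.exists(tmp_out_file):
            os.remove(tmp_out_file)
        if out_fns is not None:
            out_fns.append(out_file)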
def run(self): """Run quiver for ICE.""" # Create directories: root_dir/quivered and root_dir/log_dir/quivered mkdir(self.quivered_dir) mkdir(self.quivered_log_dir) files = get_files_from_fofn(self.fasta_fofn) msg = "Indexing {0} fasta files, please wait.".format(len(files)) self.add_log(msg) d = MetaSubreadFastaReader(files) self.add_log("Fasta files indexing done.") self.add_log("Loading uc from {f}.".format(f=self.final_pickle_fn)) a = load(open(self.final_pickle_fn)) uc = a['uc'] refs = a['refs'] self.add_log( "Loading partial uc from {f}.".format(f=self.nfl_all_pickle_fn)) partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc'] partial_uc2 = defaultdict(lambda: []) partial_uc2.update(partial_uc) # Write report to quivered/cluster_report.FL_nonFL.csv self.add_log( "Writing a csv report of cluster -> FL/NonFL reads to {f}".format( f=self.report_fn), level=logging.INFO) self.write_report(uc=uc, partial_uc=partial_uc2, report_fn=self.report_fn) good = [ x for x in uc ] #[x for x in uc if len(uc[x]) > 1 or len(partial_uc2[x]) >= 10] keys = sorted(good) # sort good keys (cluster ids) start = 0 end = len(keys) submitted = [] # submitted jobs todo = [] # to-do jobs self.submit_quiver_jobs(d=d, uc=uc, partial_uc=partial_uc2, refs=refs, keys=keys, start=start, end=end, submitted=submitted, todo=todo, use_sge=self.sge_opts.use_sge, max_sge_jobs=self.sge_opts.max_sge_jobs, quiver_nproc=self.sge_opts.quiver_nproc) with open(self.submitted_quiver_jobs_log, 'w') as f: f.write("\n".join(str(x[0]) + '\t' + str(x[1]) for x in submitted)) self.close_log() return 0
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    out_fns = manager.list()
    in_queue = manager.Queue(99999)
    pool = []
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker,
                    args=(in_queue, out_fns))
        pool.append(p)

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. Skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))

    # starting & joining pool workers; each worker runs pls2fasta and
    # trim_subread_flanks for the queued files, as in the serial version above.
    for p in pool:
        p.start()
    for p in pool:
        p.join()

    write_files_to_fofn(out_fns, out_filename)
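# Example invocation of the queue-based variant above (paths made up for
# illustration); with cpus=4, up to four pls2fasta conversions run at once.
# convert_fofn_to_fasta(fofn_filename="input.fofn",
#                       out_filename="input.fasta.fofn",
#                       fasta_out_dir="clusterOut/fasta_files",
#                       cpus=4)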