def combine_nfl_pickles(splitted_pickles, out_pickle):
    """Combine split nfl pickles into one big pickle."""
    logging.debug("Combining {N} nfl pickles: {ps} ".format(
        N=len(splitted_pickles), ps=",".join(splitted_pickles)) +
        " into a big pickle {p}.".format(p=out_pickle))

    if len(splitted_pickles) == 1:
        logging.debug("Copying the only given pickle to out_pickle.")
        if realpath(splitted_pickles[0]) != realpath(out_pickle):
            shutil.copyfile(splitted_pickles[0], out_pickle)
    else:
        # Combine all partial outputs
        logging.debug("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in splitted_pickles:
            logging.debug("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        logging.debug("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
        logging.debug("{f} created.".format(f=out_pickle))

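A minimal usage sketch for combine_nfl_pickles, assuming two chunk pickles produced upstream, each holding a dict with 'partial_uc' and 'nohit' keys as the function expects; all file names here are hypothetical placeholders.

chunk_pickles = ["nfl.chunk0.pickle", "nfl.chunk1.pickle"]  # hypothetical paths
combine_nfl_pickles(splitted_pickles=chunk_pickles,
                    out_pickle="nfl.all.pickle")  # hypothetical output path
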
def make_sane(args):
    """Sanitize input and output arguments."""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)
    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)
    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" %
                      args.smrtlink_job_dir)

    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." %
                      (args.gmap_db, args.gmap_name))

    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args

def from_file(cls, cfg_fn):
    """Read from a config file with lines like
    SAMPLE=<name>;<path>
    GROUP_FILENAME=
    GFF_FILENAME=
    COUNT_FILENAME=
    """
    sample_names, sample_paths = [], []
    group_fn = gff_fn = abundance_fn = None
    for line in [line.strip() for line in open(realpath(cfg_fn), 'r')]:
        # read and process
        if line.startswith('SAMPLE='):
            name, path = line.strip()[7:].split(';')
            sample_names.append(name)
            sample_paths.append(realpath(path))
        elif line.startswith('GROUP_FILENAME='):
            group_fn = line.strip()[len('GROUP_FILENAME='):]
        elif line.startswith('GFF_FILENAME='):
            gff_fn = line.strip()[len('GFF_FILENAME='):]
        elif line.startswith('COUNT_FILENAME='):
            abundance_fn = line.strip()[len('COUNT_FILENAME='):]

    try:
        return ChainConfig(sample_names=sample_names,
                           sample_paths=sample_paths,
                           group_fn=group_fn, gff_fn=gff_fn,
                           abundance_fn=abundance_fn)
    except ValueError as e:
        raise ValueError("%s is an invalid ChainConfig file: %s" %
                         (realpath(cfg_fn), str(e)))

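For illustration, a hedged sketch of the config format this parser accepts, assuming from_file is bound as a classmethod of ChainConfig (which it constructs); the sample names and paths are invented.

# sample.config (hypothetical contents):
#   SAMPLE=heart;/path/to/heart_collapsed
#   SAMPLE=liver;/path/to/liver_collapsed
#   GROUP_FILENAME=touse.group.txt
#   GFF_FILENAME=touse.gff
#   COUNT_FILENAME=touse.count.txt
cfg = ChainConfig.from_file("sample.config")  # hypothetical path
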
def _validate_outputs(self, _root_dir, _out_fa):
    """Validate outputs, create root_dir if it does not exist."""
    self.add_log("Checking outputs.", level=logging.INFO)
    root_dir, out_fa = _root_dir, _out_fa
    if root_dir is None:
        self.add_log("Output directory needs to be specified.",
                     level=logging.ERROR)
    if out_fa is None:
        self.add_log("Output consensus fasta needs to be specified.",
                     level=logging.ERROR)
    root_dir = realpath(root_dir)
    out_fa = realpath(out_fa)
    if op.exists(root_dir):
        self.add_log("Output directory {d} already exists.".format(d=root_dir))
    else:
        self.add_log("Creating output directory {d}.".format(d=root_dir))
        os.mkdir(root_dir)
    if op.exists(out_fa):
        raise ClusterException(
            "Consensus FASTA file {f} already exists.".format(f=out_fa))
    out_fa_dataset = None
    if out_fa.endswith(".contigset.xml"):
        out_fa_dataset = out_fa
        out_fa = re.sub(".contigset.xml", ".fasta", out_fa)
    return root_dir, out_fa, out_fa_dataset

def __init__(self, root_dir, flnc_fa, nfl_fa, bas_fofn, ccs_fofn, out_fa,
             sge_opts, ice_opts, ipq_opts, report_fn=None, summary_fn=None,
             fasta_fofn=None, output_pickle_file=None, tmp_dir=None):
    super(Cluster, self).__init__(prog_name="Cluster", root_dir=root_dir,
                                  bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                                  fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)

    self.sge_opts = sge_opts  # SGE and CPU arguments, etc.
    self.ice_opts = ice_opts  # ICE clustering algorithm arguments
    self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments
    self.output_pickle_file = output_pickle_file

    self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = \
        self._validate_inputs(_flnc_fa=flnc_fa, _nfl_fa=nfl_fa,
                              _ccs_fofn=ccs_fofn, _fasta_fofn=fasta_fofn,
                              quiver=self.ice_opts.quiver)

    self.root_dir, self.out_fa, self.out_fa_dataset = \
        self._validate_outputs(root_dir, out_fa)

    self.sanity_check()

    self._probqv = None  # probability & quality value
    self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
    self._nflncSplittedFas = []  # split nfl_fa into smaller files.
    self._logConfigs()  # Log configurations

    self.iceinit = None
    self.icec = None
    self.iceq = None
    self.pol = None

    self.add_log("Setting ece_penalty: {0} ece_min_len: {1}".format(
        ice_opts.ece_penalty, ice_opts.ece_min_len), level=logging.INFO)

    self.report_fn = realpath(report_fn) if report_fn is not None \
        else op.join(self.root_dir, "cluster_report.csv")
    self.summary_fn = realpath(summary_fn) if summary_fn is not None \
        else op.join(self.root_dir, "cluster_summary.txt")

    self.add_log("A Cluster object created.", level=logging.INFO)

def _validate_inputs(self, fasta_filenames, ref_fasta):
    """Validate input files."""
    for f in fasta_filenames:
        if not op.exists(f):
            raise IOError("Input fasta {f} does not exist.".format(f=f))
    if ref_fasta is None or not op.exists(ref_fasta):
        raise IOError("Reference {r} does not exist.".format(r=ref_fasta))
    return ([realpath(f) for f in fasta_filenames], realpath(ref_fasta))

def __init__(self, query_filename, target_filename, is_FL, same_strand_only,
             query_converted=False, target_converted=False, dazz_dir=None,
             script_dir="scripts/", use_sge=False, sge_opts=None, cpus=24):
    """
    Parameters:
      query_filename - query FASTA file
      target_filename - target FASTA file
      is_FL - whether or not reads are FLNC CCS reads
      same_strand_only - whether to align reads to the same strand only
                         (i.e., skip the reverse strand)
      query_converted - whether or not the query FASTA file has been
                        converted to a daligner-compatible FASTA file.
      target_converted - whether or not the target FASTA file has been
                         converted to a daligner-compatible FASTA file.
      dazz_dir - if None, all query.dazz.* files will be saved in the same
                 directory as query, and all target.dazz.* files will be
                 saved in the same directory as target. If a valid path,
                 all query.dazz.* and target.dazz.* files will be saved
                 to dazz_dir.
      script_dir - directory for saving all scripts
      use_sge - submit daligner jobs to sge or run them locally?
      sge_opts - sge options
      cpus - total number of cpus that can be used to align query to target.
    """
    self.query_filename = realpath(query_filename)
    self.target_filename = realpath(target_filename)
    self.is_FL = is_FL
    self.same_strand_only = same_strand_only
    self.cpus = cpus
    self.dazz_dir = dazz_dir
    self.script_dir = realpath(script_dir)
    self.output_dir = ""

    self.query_dazz_handler = DazzIDHandler(self.query_filename,
                                            converted=query_converted,
                                            dazz_dir=dazz_dir)

    # target may have already been converted (if shared)
    target_converted = (target_converted or
                        self.query_filename == self.target_filename)
    self.target_dazz_handler = DazzIDHandler(self.target_filename,
                                             converted=target_converted,
                                             dazz_dir=dazz_dir)

    self.target_blocks = self.target_dazz_handler.num_blocks
    self.query_blocks = self.query_dazz_handler.num_blocks

    self.use_sge = use_sge
    self.sge_opts = sge_opts

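A hedged construction sketch for DalignerRunner: the FASTA paths are hypothetical placeholders, and, mirroring sanity_check_daligner later in this section, SGE is disabled so jobs run locally.

runner = DalignerRunner(query_filename="nfl.fasta",            # hypothetical
                        target_filename="ref_consensus.fasta",  # hypothetical
                        is_FL=False, same_strand_only=False,
                        query_converted=False, target_converted=True,
                        dazz_dir=None, script_dir="scripts/",
                        use_sge=False, sge_opts=None, cpus=4)
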
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa,
                            target_filename=testInFa,
                            is_FL=True, same_strand_only=True,
                            query_converted=False, target_converted=False,
                            use_sge=False, cpus=4, sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True

def __init__(self, input_filename, converted=False, dazz_dir=None):
    """
    input_filename - input FASTA/FASTQ/ContigSet file
    converted - whether or not the input file has been converted to a
                daligner-compatible FASTA file.
    dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files in the
               same directory as the input file. If a valid path, save all
               output files to dazz_dir.
    """
    self.dazz_dir = dazz_dir
    self.input_filename = realpath(input_filename)
    self.validate_file_type(self.input_filename)

    # index --> original sequence ID, e.g., 1 --> movie/zmw/start_end_CCS
    self.dazz_mapping = {}

    if converted and not nfs_exists(self.db_filename):
        log.warning(str(self.input_filename) +
                    " should have been converted to daligner-compatible" +
                    " format, but in fact it is not. Converting ...")
        converted = False

    if not converted:
        self.convert_to_dazz_fasta()
        self.make_db()
    else:
        self.read_dazz_pickle()

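A small usage sketch for DazzIDHandler, assuming an unconverted FASTA input (path hypothetical); on construction the handler converts the file, builds the dazz DB, and records the index-to-read-ID mapping.

handler = DazzIDHandler("reads.fasta", converted=False, dazz_dir=None)
# handler.dazz_mapping now maps dazz indices back to original sequence IDs.
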
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn, ice_opts, sge_opts,
             ipq_opts, fasta_fofn=None, tmp_dir=None):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. ccs.fofn of ccs files.
    ipq_opts --- IceQuiverHQLQOptions
        qv_trim_5: ignore QV of n bases in the 5' end
        qv_trim_3: ignore QV of n bases in the 3' end
        hq_quiver_min_accuracy: minimum allowed quiver accuracy to mark
            an isoform as high quality
        hq_isoforms_fa|fq: polished, high quality consensus isoforms
            in fasta|q
        lq_isoforms_fa|fq: polished, low quality consensus isoforms
            in fasta|q
    """
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
    self.nfl_fa = realpath(nfl_fa)
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
        self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

    self.icep = None   # IceAllPartials
    self.iceq = None   # IceQuiver
    self.icepq = None  # IceQuiverPostprocess
    self._nfl_splitted_fas = None

    self.validate_inputs()

def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to reference by gmap, generate a sam output and sort sam.
    Parameters:
      input_filename -- input isoforms, e.g., hq_isoforms.fasta|fastq|xml
      sam_filename -- output sam file, produced by gmap and sorted.
      gmap_db_dir -- gmap database directory
      gmap_db_name -- gmap database name
      gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and
    # ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
                'ls *.iit *meta',
                'sleep 3',
                'cd %s' % real_upath(cwd)]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap',
                '-D {d}'.format(d=real_upath(gmap_db_dir)),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                '--max-intronlength-ends 200000',  # for long genes
                real_upath(gmap_input_filename),
                '>', real_upath(unsorted_sam_filename),
                '2>{log}'.format(log=real_upath(log_filename))]

    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)

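A hedged call sketch for map_isoforms_and_sort; the file paths and GMAP database name below are placeholders, not values from the source.

map_isoforms_and_sort(input_filename="hq_isoforms.fastq",      # hypothetical
                      sam_filename="hq_isoforms.sorted.sam",   # hypothetical
                      gmap_db_dir="/path/to/gmap_db",          # hypothetical
                      gmap_db_name="hg19",                     # hypothetical
                      gmap_nproc=12)
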
def run(self):
    """Run"""
    writer = open(self.output_analysis_fn, 'w')
    writer.write("#isoseq_output_fn = %s\n" %
                 realpath(self.isoseq_output_fn))
    writer.write("#reference_transcripts_fn = %s\n" %
                 realpath(self.reference_transcripts_fn))
    writer.write("#total_num_isoforms = %s\n" % self.n_isoforms)
    writer.write("#total_num_reference_transcripts = %s\n" % self.n_refs)
    writer.write("#num_true_positive = %s\n" % self.n_true_positive)
    writer.write("#num_false_positive = %s\n" % self.n_false_positive)
    for ref in self.reference_transcripts:
        is_detected = ref.name in self.refs_detected
        writer.write("%s\t%s\t%s\n" %
                     (ref.name, len(ref.sequence),
                      'DETECTED' if is_detected else 'MISSED'))
    writer.close()

def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn,
                     _fasta_fofn=None, quiver=False):
    """Validate input files and return absolute expanded paths."""
    flnc_fa, nfl_fa = _flnc_fa, _nfl_fa
    ccs_fofn, fasta_fofn = _ccs_fofn, _fasta_fofn
    self.add_log("Checking input files.", level=logging.INFO)
    if flnc_fa is None:
        raise ClusterException(
            "Input full-length non-chimeric reads " +
            "file (i.e., flnc_fa) needs to be specified.")
    else:
        flnc_fa = realpath(flnc_fa)
        if not op.exists(flnc_fa):
            raise ClusterException(
                "Unable to find full-length " +
                "non-chimeric reads: {fn}".format(fn=flnc_fa))

    if nfl_fa is None:
        if quiver is True:
            raise ClusterException(
                "Input non-full-length reads file (i.e., nfl_fa)" +
                " needs to be specified for isoform polish.")
    else:
        nfl_fa = realpath(nfl_fa)
        if not op.exists(nfl_fa):
            raise ClusterException(
                "Unable to find non-full-length " +
                "non-chimeric reads: {fn}".format(fn=nfl_fa))

    if ccs_fofn is not None:
        try:
            ccs_fofn = validate_fofn(ccs_fofn)
        except IOError as e:
            raise ClusterException(str(e))
    if fasta_fofn is not None and quiver:
        try:
            fasta_fofn = validate_fofn(fasta_fofn)
        except IOError as e:
            raise ClusterException(str(e))
    return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)

def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to reference by gmap, generate a sam output and sort sam.
    Parameters:
      input_filename -- input isoforms, e.g., hq_isoforms.fasta|fastq|xml
      sam_filename -- output sam file, produced by gmap and sorted.
      gmap_db_dir -- gmap database directory
      gmap_db_name -- gmap database name
      gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and
    # ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % op.join(gmap_db_dir, gmap_db_name),
                'ls *.iit *meta',
                'sleep 3',
                'cd %s' % cwd]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap',
                '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]

    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)

def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    cmd = " ".join([gcon_py, real_upath(testInFa),
                    "{testDir}/g_consensus".format(
                        testDir=real_upath(testDir)),
                    "c1"])

    write_cmd_to_script(cmd=cmd, script=testSh)
    assert op.exists(testSh)
    cmd = sge_opts.qsub_cmd(script=real_upath(testSh),
                            num_threads=1, wait_before_exit=True)
    logging.debug("Submitting cmd: " + cmd)
    backticks(cmd)

    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True

def __init__(self, flnc_filename, root_dir, out_pickle, output_basename):
    """
    Reads in the input flnc file are separated into multiple categories
    according to the separation criterion, and reads in each category are
    written to
        <root_dir>/<separation_criteria>/<output_basename>.fasta|contigset.xml
    e.g., if reads are separated by primers, then reads are written to
        <root_dir>/<primer*>/<output_basename>.fasta|contigset.xml

    Parameters:
      flnc_filename - input full-length non-chimeric reads in FASTA
                      or ContigSet
      root_dir - output root directory
      output_basename - output file basename
    """
    self.flnc_filename = flnc_filename
    self.root_dir = realpath(root_dir)
    mkdir(root_dir)
    self.output_basename = output_basename
    self.create_contigset = True if flnc_filename.endswith(".xml") else False
    self.handles = {}  # key --> fasta file handler
    self.out_pickle = out_pickle if out_pickle is not None \
        else op.join(self.root_dir, "separate_flnc.pickle")

def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa, bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn, fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons, sge_opts=sge_opts,
                      ice_opts=ice_opts, ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:
            # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform clusters from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
        out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage,
        min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage,
        max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon,
        min_count=args.min_count)

    return 0

def __init__(self, reads_fn="test.fasta", out_dir="output/",
             out_reads_fn="testout.fasta", primer_fn=None,
             primer_report_fn=None, summary_fn=None, cpus=1,
             change_read_id=True,
             opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False),
             out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False,
             reuse_dom=False, ignore_empty_output=False):
    self.reads_fn = realpath(reads_fn)
    self.out_dir = realpath(out_dir)
    self.cpus = cpus
    self.change_read_id = change_read_id
    self.chimera_detection_opts = opts
    self.ignore_polyA = ignore_polyA
    self.reuse_dom = reuse_dom
    self.ignore_empty_output = ignore_empty_output
    self._numReads = None

    # The input primer file: primers.fasta
    self.primer_fn = primer_fn if primer_fn is not None else \
        op.join(self.data_dir, PRIMERFN)
    # The output fasta file.
    self.out_all_reads_fn = realpath(out_reads_fn)

    # Intermediate output fasta files before chimera detection:
    # trimmed full-length reads: fl.trimmed.fasta
    # and trimmed non-full-length reads: nfl.trimmed.fasta
    self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")
    self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta")

    self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
    self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

    # The output primer file: primer_info.csv
    self.primer_report_fn = primer_report_fn \
        if primer_report_fn is not None else \
        ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
    # primer reports for nfl reads before chimera detection. Note that
    # chimera detection is not necessary for nfl reads.
    self._primer_report_nfl_fn = op.join(self.out_dir,
                                         "primer_report.nfl.csv")
    # primer reports for fl reads after chimera detection. Note that
    # chimera detection is required for fl reads.
    self._primer_report_fl_fn = op.join(self.out_dir,
                                        "primer_report.fl.csv")

    # The matrix file: PBMATRIX.txt
    self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

    # The output phmmer Dom file for trimming primers: hmmer.front_end.dom
    self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
    # The output phmmer Dom files for chimera detection:
    # hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom
    self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN)
    self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN)

    self.chunked_front_back_reads_fns = None
    self.chunked_front_back_dom_fns = None

    #self.chunked_trimmed_reads_fns = None
    #self.chunked_trimmed_reads_dom_fns = None

    # The summary file: *.classify_summary.txt
    self.summary = ClassifySummary()
    self.summary_fn = summary_fn if summary_fn is not None else \
        ".".join(out_reads_fn.split('.')[:-1]) + "." + CLASSIFYSUMMARY

    self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
        else op.join(self.out_dir, "nfl.fasta")
    self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta")
    self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta")
    self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
        else op.join(self.out_dir, "flnc.fasta")
    self.out_flc_fn = op.join(self.out_dir, "flc.fasta")

    for file_attr in ["out_nfl_fn", "out_nflnc_fn", "out_nflc_fn",
                      "out_flnc_fn", "out_flc_fn", "out_all_reads_fn"]:
        file_name = fasta_file_name = getattr(self, file_attr)
        if file_name.endswith(".xml"):
            fasta_file_name = ".".join(file_name.split(".")[:-2]) + ".fasta"
        setattr(self, "%s_fasta" % file_attr, fasta_file_name)

def __init__(self, combined_dir):
    self.combined_dir = realpath(combined_dir)
    mkdir(self.combined_dir)

def post_mapping_to_genome_runner(
        in_isoforms, in_sam, in_pickle, out_isoforms, out_gff,
        out_abundance, out_group, out_read_stat,
        min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
        min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
        min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
        max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
        allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
        skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
        min_count=fci.Constants.MIN_COUNT_DEFAULT,
        to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file.
    (3) Based on abundance file, filter collapsed isoforms by min FL count.
    """
    log.info('args: {!r}'.format(locals()))

    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError("Format of input and output isoforms %s, %s "
                         "must be the same." % (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input and output isoforms %s, %s "
                         "must be FASTA or FASTQ." %
                         (in_isoforms, out_isoforms))

    # (1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn, pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn,
                    in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn,
                    in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) ln output files
    ln_pairs = [(fff.filtered_rep_fn(out_suffix), out_isoforms),  # rep isoforms
                (fff.filtered_gff_fn, out_gff),  # gff annotation
                (fff.filtered_abundance_fn, out_abundance),  # abundance info
                (fff.group_fn, out_group),  # groups
                (fff.read_stat_fn, out_read_stat)]  # read stat info
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, "
                 "to_filter_out_subsets = %s",
                 min_count, to_filter_out_subsets)
    logging.info("Collapsed and filtered isoform sequences written to %s",
                 realpath(out_isoforms) if out_isoforms is not None
                 else realpath(fff.filtered_rep_fn(out_suffix)))
    logging.info("Collapsed and filtered isoform annotations written to %s",
                 realpath(out_gff) if out_gff is not None
                 else realpath(fff.filtered_gff_fn))
    logging.info("Collapsed and filtered isoform abundance info written to %s",
                 realpath(out_abundance) if out_abundance is not None
                 else realpath(fff.filtered_abundance_fn))
    logging.info("Collapsed isoform groups written to %s",
                 realpath(out_group) if out_group is not None
                 else realpath(fff.group_fn))
    logging.info("Read status of FL and nFL reads written to %s",
                 realpath(out_read_stat) if out_read_stat is not None
                 else realpath(fff.read_stat_fn))

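A hedged invocation sketch of post_mapping_to_genome_runner, wiring a sorted GMAP SAM and the HQ/LQ prefix pickle into the collapse/count/filter steps; all file names are placeholders and the keyword defaults are left in place.

post_mapping_to_genome_runner(
    in_isoforms="all.polished_hq.fastq",       # hypothetical
    in_sam="sorted_gmap.sam",                  # hypothetical
    in_pickle="hq_lq_prefix_dict.pickle",      # hypothetical
    out_isoforms="touse.rep.fastq",            # hypothetical
    out_gff="touse.gff",                       # hypothetical
    out_abundance="touse.abundance.txt",       # hypothetical
    out_group="touse.group.txt",               # hypothetical
    out_read_stat="touse.read_stat.txt")       # hypothetical
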
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using DALIGNER, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    tmp_dir --- where to save intermediate files such as dazz files.
                If None, write dazz files to the same directory as
                query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything
    # local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs",
                             ccs_fofn, input_fasta, time.time() - start_t)
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s",
                             input_fasta, ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs",
                             input_fastq, time.time() - start_t)

    logging.info("Calling daligner_against_ref ...")

    # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1, ece_min_len=20, same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()

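A hedged call sketch for this variant of build_uc_from_partial_daligner, with QV and alignment checking disabled (the default here) so the dummy probability model is used; all paths are placeholders.

build_uc_from_partial_daligner(
    input_fasta="nfl.chunk0.fasta",       # hypothetical
    ref_fasta="ref_consensus.fasta",      # hypothetical
    out_pickle="nfl.chunk0.pickle",       # hypothetical
    ccs_fofn=None, done_filename=None,
    cpus=4, no_qv_or_aln_checking=True)
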
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename, ice_opts, probqv,
                                   qv_prob_threshold=0.3, cpus=4,
                                   no_qv_or_aln_checking=False,
                                   tmp_dir=None, sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using DALIGNER, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    tmp_dir --- where to save intermediate files such as dazz files.
                If None, write dazz files to the same directory as
                query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything
    # local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        # not providing full_missed_start/end since aligning nFLs,
        # ok to partially align only
        hitItems = daligner_against_ref2(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=sID_starts_with_c,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ice_opts.ece_penalty,
            ece_min_len=ice_opts.ece_min_len,
            same_strand_only=True,
            no_qv_or_aln_checking=no_qv_or_aln_checking,
            max_missed_start=ice_opts.max_missed_start,
            max_missed_end=ice_opts.max_missed_end,
            full_missed_start=ice_opts.full_missed_start,
            full_missed_end=ice_opts.full_missed_end)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()

def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename, ice_opts, probqv,
                                qv_prob_threshold=0.3, cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None, sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using BLASR, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(
              r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    logging.info("Calling blasr_against_ref2 ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

def __init__(self, root_dir):
    self.root_dir = realpath(root_dir)

def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle, ccs_fofn=None, done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True, tmp_dir=None): """ Given an input_fasta file of non-full-length (partial) reads and (unpolished) consensus isoform sequences in ref_fasta, align reads to consensus isoforms using DALIGNER, and then build up a mapping between consensus isoforms and reads (i.e., assign reads to isoforms). Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)} to an output pickle file. ccs_fofn --- If None, assume no quality value is available, otherwise, use QV from ccs_fofn. tmp_dir --- where to save intermediate files such as dazz files. If None, write dazz files to the same directory as query/target. """ input_fasta = realpath(input_fasta) ref_fasta = realpath(ref_fasta) out_pickle = realpath(out_pickle) output_dir = op.dirname(out_pickle) ice_opts = IceOptions() ice_opts.detect_cDNA_size(ref_fasta) # ice_partial is already being called through qsub, so run everything local! runner = DalignerRunner(query_filename=input_fasta, target_filename=ref_fasta, is_FL=False, same_strand_only=False, query_converted=False, target_converted=True, dazz_dir=tmp_dir, script_dir=op.join(output_dir, "script"), use_sge=False, sge_opts=None, cpus=cpus) runner.run(min_match_len=300, output_dir=output_dir, sensitive_mode=ice_opts.sensitive_mode) if no_qv_or_aln_checking: # not using QVs or alignment checking! # this probqv is just a DUMMY to pass to daligner_against_ref, which won't be used logging.info("Not using QV for partial_uc. Loading dummy QV.") probqv = ProbFromModel(.01, .07, .06) else: if ccs_fofn is None: logging.info("Loading probability from model (0.01,0.07,0.06)") probqv = ProbFromModel(.01, .07, .06) else: start_t = time.time() if use_finer_qv: probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta) logging.info("Loading QVs from %s + %s took %s secs", ccs_fofn, input_fasta, time.time()-start_t) else: input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq' logging.info("Converting %s + %s --> %s", input_fasta, ccs_fofn, input_fastq) ice_fa2fq(input_fasta, ccs_fofn, input_fastq) probqv = ProbFromFastq(input_fastq) logging.info("Loading QVs from %s took %s secs", input_fastq, time.time()-start_t) logging.info("Calling daligner_against_ref ...") partial_uc = {} # Maps each isoform (cluster) id to a list of reads # which can map to the isoform seen = set() # reads seen logging.info("Building uc from DALIGNER hits.") for la4ice_filename in runner.la4ice_filenames: start_t = time.time() hitItems = daligner_against_ref(query_dazz_handler=runner.query_dazz_handler, target_dazz_handler=runner.target_dazz_handler, la4ice_filename=la4ice_filename, is_FL=False, sID_starts_with_c=True, qver_get_func=probqv.get_smoothed, qvmean_get_func=probqv.get_mean, ece_penalty=1, ece_min_len=20, same_strand_only=False, no_qv_or_aln_checking=no_qv_or_aln_checking) for h in hitItems: if h.ece_arr is not None: if h.cID not in partial_uc: partial_uc[h.cID] = set() partial_uc[h.cID].add(h.qID) seen.add(h.qID) logging.info("processing %s took %s sec", la4ice_filename, str(time.time()-start_t)) for k in partial_uc: partial_uc[k] = list(partial_uc[k]) allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta)) logging.info("Counting reads with no hit.") nohit = allhits.difference(seen) logging.info("Dumping uc to a pickle: %s.", out_pickle) with open(out_pickle, 'w') as f: if out_pickle.endswith(".pickle"): dump({'partial_uc': partial_uc, 'nohit': nohit}, f) elif out_pickle.endswith(".json"): f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit})) else: raise IOError("Unrecognized extension: %s" % out_pickle) done_filename = realpath(done_filename) if done_filename is not None \ else out_pickle + '.DONE' logging.debug("Creating %s.", done_filename) touch(done_filename) # remove all the .las and .las.out files runner.clean_run()
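# The pickle written above maps each cluster id to the list of partial reads
# assigned to it, plus the set of read ids with no hit. A minimal sketch of
# inspecting it downstream (the path 'output/nfl.partial_uc.pickle' is a
# hypothetical example, not a name this module guarantees):
#
#   from cPickle import load
#   with open('output/nfl.partial_uc.pickle') as f:
#       result = load(f)
#   for cid, read_ids in result['partial_uc'].iteritems():
#       print "cluster %s <- %d partial reads" % (cid, len(read_ids))
#   print "%d reads had no hit" % len(result['nohit'])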
def run(self, output_dir='.', min_match_len=300, sensitive_mode=False): """ if self.use_sge --- writes to <scripts>/daligner_job_#.sh else --- run locally, dividing into self.cpus/4 tasks (capped max at 4) NOTE 1: when using SGE, be careful that multiple calls to this might end up writing to the SAME job.sh files, this should be avoided by changing <scripts> directory NOTE 2: more commonly this should be invoked locally (since ice_partial.py is the one being qsub-ed), in that case it is recommended to keep self.cpus = 4 so that each daligner job is run consecutively and that the original qsub job should have been called with qsub -pe smp 4 (set by --blasr_nproc 4) In this way, the daligner jobs are called consecutively, but LA4Ice is parallelized 4X """ self.output_dir = realpath(output_dir) # Reset output_dir old_dir = realpath(op.curdir) mkdir(output_dir) os.chdir(output_dir) if self.use_sge: mknewdir(self.script_dir) # preparing done scripts is no longer necessary. #self.write_daligner_done_script() #self.write_la4ice_done_script() # (a) run all daligner jobs daligner_cmds = self.daligner_cmds(min_match_len=min_match_len, sensitive_mode=sensitive_mode) logging.info("Start daligner cmds " + ("using sge." if self.use_sge else "locally.")) logging.debug("CMD: " + "\n".join(daligner_cmds)) start_t = time.time() failed = [] if self.use_sge: failed.extend( sge_job_runner(cmds_list=daligner_cmds, script_files=self.daligner_scripts, #done_script=self.daligner_done_script, num_threads_per_job=DALIGNER_NUM_THREADS, sge_opts=self.sge_opts, qsub_try_times=3, wait_timeout=600, run_timeout=600, rescue="sge", rescue_times=3)) else: # max 4 at a time to avoid running out of memory... failed.extend( local_job_runner(cmds_list=daligner_cmds, num_threads=max(1, min(self.cpus/4, 4)))) logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.") # (b) run all LA4Ice jobs start_t = time.time() logging.info("Start LA4Ice cmds " + ("using sge." if self.use_sge else "locally.")) la4ice_cmds = self.la4ice_cmds logging.debug("CMD: " + "\n".join(la4ice_cmds)) if self.use_sge: failed.extend( sge_job_runner(cmds_list=la4ice_cmds, script_files=self.la4ice_scripts, #done_script=self.la4ice_done_script, num_threads_per_job=DALIGNER_NUM_THREADS, sge_opts=self.sge_opts, qsub_try_times=3, wait_timeout=600, run_timeout=600, rescue="sge", rescue_times=3)) else: # max 4 at a time to avoid running out of memory... failed.extend( local_job_runner(cmds_list=la4ice_cmds, num_threads=max(1, min(self.cpus, 4)))) logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.") os.chdir(old_dir) if len(failed) == 0: return 0 else: raise RuntimeError("%s.run failed, %s." % (self.__class__.__name__, "\n".join([x[0] for x in failed])))
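# Usage sketch for a local (non-SGE) run, mirroring how
# build_uc_from_partial_daligner drives this method above; the filenames are
# hypothetical placeholders:
#
#   runner = DalignerRunner(query_filename='nfl.fasta', target_filename='ref.fasta',
#                           is_FL=False, same_strand_only=False, query_converted=False,
#                           target_converted=True, dazz_dir=None,
#                           script_dir='out/script', use_sge=False,
#                           sge_opts=None, cpus=4)
#   runner.run(min_match_len=300, output_dir='out', sensitive_mode=False)
#   runner.clean_run()  # remove intermediate .las and .las.out files
#
# With cpus=4, max(1, min(4/4, 4)) == 1 daligner job runs at a time (each using
# DALIGNER_NUM_THREADS threads), while LA4Ice runs max(1, min(4, 4)) == 4 jobs
# in parallel, per NOTE 2 above.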
def args_runner(args): """Run given input args, e.g., filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2 filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2 --no_filter_subsets """ in_fq, out_fq = args.in_rep_fastq, args.out_rep_fastq def _get_prefix_of_rep_fq(fn): """Return prefix of *.rep.fq""" if fn.endswith(".rep.fastq") or fn.endswith(".rep.fq"): return '.'.join(fn.split(".")[0:-2]) elif fn.endswith(".fastq") or fn.endswith(".fq"): return '.'.join(fn.split(".")[0:-1]) raise ValueError("Invalid collapsed isoforms .rep.fastq file %s" % fn) input_prefix = _get_prefix_of_rep_fq(in_fq) output_prefix = _get_prefix_of_rep_fq(out_fq) # infer group.txt, abundance.txt and gff in_group_filename = input_prefix + ".group.txt" in_abundance_filename = input_prefix + ".abundance.txt" in_gff_filename = input_prefix + ".gff" tmp_out_abundance_filename = output_prefix + ".has_subsets.abundance.txt" tmp_out_gff_filename = output_prefix + ".has_subsets.gff" tmp_out_fq = output_prefix + ".has_subsets.rep.fastq" out_abundance_filename = output_prefix + ".abundance.txt" out_gff_filename = output_prefix + ".gff" # Filter collapsed isoforms by min FL count. logging.info("Filtering collapsed isoforms by count %s", args.min_count) filter_by_count(in_group_filename=in_group_filename, in_abundance_filename=in_abundance_filename, in_gff_filename=in_gff_filename, in_rep_filename=in_fq, out_abundance_filename=tmp_out_abundance_filename, out_gff_filename=tmp_out_gff_filename, out_rep_filename=tmp_out_fq, min_count=args.min_count) # Remove collapsed isoforms which are a subset of another isoform logging.info("Filtering out subsets collapsed isoforms = %s", args.filter_out_subsets) if args.filter_out_subsets is True: filter_out_subsets(in_abundance_filename=tmp_out_abundance_filename, in_gff_filename=tmp_out_gff_filename, in_rep_filename=tmp_out_fq, out_abundance_filename=out_abundance_filename, out_gff_filename=out_gff_filename, out_rep_filename=out_fq, max_fuzzy_junction=args.max_fuzzy_junction) rmpath(tmp_out_abundance_filename) rmpath(tmp_out_gff_filename) rmpath(tmp_out_fq) else: mv(tmp_out_abundance_filename, out_abundance_filename) mv(tmp_out_gff_filename, out_gff_filename) mv(tmp_out_fq, out_fq) logging.info("Filtered collapsed isoforms sequences written to %s", realpath(out_fq)) logging.info("Filtered collapsed isoforms abundance written to %s", realpath(out_abundance_filename)) logging.info("Filtered collapsed isoforms gff written to %s", realpath(out_gff_filename)) return 0
def run(self): """ First, collapse input isoforms by calling Branch.run(). Then collapse fuzzy junctions by calling collapse_fuzzy_junctions. Finally, pick up representitive gff record for each group of collapsed isoforms. """ self.validate_inputs() logging.info("Collapsing isoforms into transcripts.") b = Branch(isoform_filename=self.isoform_filename, sam_filename=self.sam_filename, cov_threshold=self.min_flnc_coverage, min_aln_coverage=self.min_aln_coverage, min_aln_identity=self.min_aln_identity) b.run(allow_extra_5exon=self.allow_extra_5exon, skip_5_exon_alt=self.skip_5_exon_alt, ignored_ids_fn=self.ignored_ids_txt_fn, good_gff_fn=self.good_unfuzzy_gff_fn, bad_gff_fn=self.bad_unfuzzy_gff_fn, group_fn=self.unfuzzy_group_fn) logging.info("Good unfuzzy isoforms written to: %s", realpath(self.good_unfuzzy_gff_fn)) logging.info("Bad unfuzzy isoforms written to: %s", realpath(self.bad_unfuzzy_gff_fn)) logging.info("Unfuzzy isoform groups written to: %s", realpath(self.unfuzzy_group_fn)) if self.shall_collapse_fuzzy_junctions: logging.info("Further collapsing fuzzy junctions.") # need to further collapse those that have fuzzy junctions! collapse_fuzzy_junctions( gff_filename=self.good_unfuzzy_gff_fn, group_filename=self.unfuzzy_group_fn, fuzzy_gff_filename=self.good_fuzzy_gff_fn, fuzzy_group_filename=self.fuzzy_group_fn, allow_extra_5exon=self.allow_extra_5exon, max_fuzzy_junction=self.max_fuzzy_junction) logging.info("Good fuzzy isoforms written to: %s", realpath(self.good_fuzzy_gff_fn)) logging.info("Bad fuzzy isoforms written to: %s", realpath(self.bad_fuzzy_gff_fn)) logging.info("Fuzzy isoform groups written to: %s", realpath(self.fuzzy_group_fn)) ln(self.good_fuzzy_gff_fn, self.good_gff_fn) ln(self.good_fuzzy_gff_fn, self.gff_fn) ln(self.fuzzy_group_fn, self.group_fn) else: logging.info("No need to further collapse fuzzy junctions.") ln(self.good_unfuzzy_gff_fn, self.good_gff_fn) ln(self.good_unfuzzy_gff_fn, self.gff_fn) ln(self.unfuzzy_group_fn, self.group_fn) # Pick up representative logging.info("Picking up representative record.") pick_least_err_instead = not self.allow_extra_5exon # 5merge, pick longest pick_rep(isoform_filename=self.isoform_filename, gff_filename=self.good_gff_fn, group_filename=self.group_fn, output_filename=self.rep_fn(self.suffix), pick_least_err_instead=pick_least_err_instead, bad_gff_filename=self.bad_gff_fn) logging.info("Ignored IDs written to: %s", realpath(self.ignored_ids_txt_fn)) logging.info("Output GFF written to: %s", realpath(self.gff_fn)) logging.info("Output Group TXT written to: %s", realpath(self.group_fn)) logging.info("Output collapsed isoforms written to: %s", realpath(self.rep_fn(self.suffix))) logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle, ccs_fofn=None, done_filename=None, blasr_nproc=12, tmp_dir=None): """ Given an input_fasta file of non-full-length (partial) reads and (unpolished) consensus isoform sequences in ref_fasta, align reads to consensus isoforms using BLASR, and then build up a mapping between consensus isoforms and reads (i.e., assign reads to isoforms). Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)} to an output pickle file. ccs_fofn --- If None, assume no quality value is available, otherwise, use QV from ccs_fofn. blasr_nproc --- equivalent to blasr --nproc, number of CPUs to use """ input_fasta = _get_fasta_path(realpath(input_fasta)) m5_file = os.path.basename(input_fasta) + ".blasr" if tmp_dir is not None: m5_file = op.join(tmp_dir, m5_file) out_pickle = realpath(out_pickle) cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \ "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \ "--nproc {n} -m 5 ".format(n=blasr_nproc) + \ "--maxScore -1000 --minPctIdentity 85 " + \ "--out {o} ".format(o=real_upath(m5_file)) + \ "1>/dev/null 2>/dev/null" execute(cmd) if ccs_fofn is None: logging.info("Loading probability from model") probqv = ProbFromModel(.01, .07, .06) else: # FIXME this will not work with current CCS bam output, which lacks # QV pulse features required - this is handled via a workaround in # pbtranscript.tasks.ice_partial logging.info("Loading probability from QV in %s", ccs_fofn) probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta) logging.info("Calling blasr_against_ref ...") hitItems = blasr_against_ref(output_filename=m5_file, is_FL=False, sID_starts_with_c=True, qver_get_func=probqv.get_smoothed, qvmean_get_func=probqv.get_mean, ece_penalty=1, ece_min_len=10, same_strand_only=False) partial_uc = {} # Maps each isoform (cluster) id to a list of reads # which can map to the isoform seen = set() # reads seen logging.info("Building uc from BLASR hits.") for h in hitItems: if h.ece_arr is not None: if h.cID not in partial_uc: partial_uc[h.cID] = set() partial_uc[h.cID].add(h.qID) seen.add(h.qID) for k in partial_uc: partial_uc[k] = list(partial_uc[k]) allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta)) logging.info("Counting reads with no hit.") nohit = allhits.difference(seen) logging.info("Dumping uc to a pickle: %s.", out_pickle) with open(out_pickle, 'w') as f: if out_pickle.endswith(".pickle"): dump({'partial_uc': partial_uc, 'nohit': nohit}, f) elif out_pickle.endswith(".json"): f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit})) else: raise IOError("Unrecognized extension: %s" % out_pickle) os.remove(m5_file) done_filename = realpath(done_filename) if done_filename is not None \ else out_pickle + '.DONE' logging.debug("Creating %s.", done_filename) touch(done_filename)
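# For concreteness, with input_fasta='nfl.fasta', ref_fasta='ref.fasta',
# blasr_nproc=12 and tmp_dir=None, the command assembled above is (paths shown
# before real_upath escaping):
#
#   blasr nfl.fasta ref.fasta --bestn 5 --nproc 12 -m 5 --maxScore -1000 \
#       --minPctIdentity 85 --out nfl.fasta.blasr 1>/dev/null 2>/dev/null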
def __init__(self, combined_dir): self.combined_dir = realpath(combined_dir) mkdir(self.combined_dir)
def g(d, base_fn): """Convert file basename to abs file path""" return op.join(realpath(d), base_fn)
def post_mapping_to_genome_runner(in_isoforms, in_sam, in_pickle, out_isoforms, out_gff, out_abundance, out_group, out_read_stat, min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT, min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT, min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT, max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT, allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT, skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT, min_count=fci.Constants.MIN_COUNT_DEFAULT, to_filter_out_subsets=True): """ (1) Collapse isoforms and merge fuzzy junctions if needed. (2) Generate read stat file and abundance file (3) Based on abundance file, filter collapsed isoforms by min FL count """ # Check input and output format in_suffix = parse_ds_filename(in_isoforms)[1] out_prefix, out_suffix = parse_ds_filename(out_isoforms) if in_suffix != out_suffix: raise ValueError("Format of input and output isoforms %s, %s must be the same." % (in_isoforms, out_isoforms)) if in_suffix not in ("fasta", "fastq"): raise ValueError("Format of input and output isoforms %s, %s must be FASTA or FASTQ." % (in_isoforms, out_isoforms)) #(1) Collapse isoforms and merge fuzzy junctions if needed. cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon) cir = CollapseIsoformsRunner(isoform_filename=in_isoforms, sam_filename=in_sam, output_prefix=out_prefix, min_aln_coverage=min_aln_coverage, min_aln_identity=min_aln_identity, min_flnc_coverage=min_flnc_coverage, max_fuzzy_junction=max_fuzzy_junction, allow_extra_5exon=allow_extra_5exon, skip_5_exon_alt=skip_5_exon_alt) cir.run() # (2) Generate read stat file and abundance file cr = CountRunner(group_filename=cf.group_fn, pickle_filename=in_pickle, output_read_stat_filename=cf.read_stat_fn, output_abundance_filename=cf.abundance_fn) cr.run() # (3) Filter collapsed isoforms by min FL count based on abundance file. 
fff = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon, min_count=min_count, filter_out_subsets=False) filter_by_count(in_group_filename=cf.group_fn, in_abundance_filename=cf.abundance_fn, in_gff_filename=cf.good_gff_fn, in_rep_filename=cf.rep_fn(out_suffix), out_abundance_filename=fff.filtered_abundance_fn, out_gff_filename=fff.filtered_gff_fn, out_rep_filename=fff.filtered_rep_fn(out_suffix), min_count=min_count) fft = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon, min_count=min_count, filter_out_subsets=True) # (4) Remove collapsed isoforms which are a subset of another isoform if to_filter_out_subsets: filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn, in_gff_filename=fff.filtered_gff_fn, in_rep_filename=fff.filtered_rep_fn(out_suffix), out_abundance_filename=fft.filtered_abundance_fn, out_gff_filename=fft.filtered_gff_fn, out_rep_filename=fft.filtered_rep_fn(out_suffix), max_fuzzy_junction=max_fuzzy_junction) fff = fft # (5) ln output files ln_pairs = [(fff.filtered_rep_fn(out_suffix), out_isoforms), # rep isoforms (fff.filtered_gff_fn, out_gff), # gff annotation (fff.filtered_abundance_fn, out_abundance), # abundance info (fff.group_fn, out_group), # groups (fff.read_stat_fn, out_read_stat)] # read stat info for src, dst in ln_pairs: if dst is not None: ln(src, dst) logging.info("Filter arguments: min_count = %s, to_filter_out_subsets = %s", min_count, to_filter_out_subsets) logging.info("Collapsed and filtered isoform sequences written to %s", realpath(out_isoforms) if out_isoforms is not None else realpath(fff.filtered_rep_fn(out_suffix))) logging.info("Collapsed and filtered isoform annotations written to %s", realpath(out_gff) if out_gff is not None else realpath(fff.filtered_gff_fn)) logging.info("Collapsed and filtered isoform abundance info written to %s", realpath(out_abundance) if out_abundance is not None else realpath(fff.filtered_abundance_fn)) logging.info("Collapsed isoform groups written to %s", realpath(out_group) if out_group is not None else realpath(fff.group_fn)) logging.info("Read status of FL and nFL reads written to %s", realpath(out_read_stat) if out_read_stat is not None else realpath(fff.read_stat_fn))
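# Minimal sketch of a direct call; the inputs are hypothetical, and every
# threshold not shown falls back to the cmi.Constants / fci.Constants defaults
# in the signature above:
#
#   post_mapping_to_genome_runner(
#       in_isoforms='all_hq.fastq', in_sam='sorted_gmap.sam',
#       in_pickle='hq_lq_prefix_dict.pickle',
#       out_isoforms='out/sample.fastq', out_gff='out/sample.gff',
#       out_abundance='out/sample.abundance.txt',
#       out_group='out/sample.group.txt',
#       out_read_stat='out/sample.read_stat.txt')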
def args_runner(args): """args runner""" logging.info("%s arguments are:\n%s\n", __file__, args) # sanity check arguments _sanity_check_args(args) # make option objects ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv, targeted_isoseq=args.targeted_isoseq, ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len, flnc_reads_per_split=args.flnc_reads_per_split, nfl_reads_per_split=args.nfl_reads_per_split) sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge, max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc, quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc, sge_env_name=args.sge_env_name, sge_queue=args.sge_queue) ipq_opts = IceQuiverHQLQOptions( qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3, hq_quiver_min_accuracy=args.hq_quiver_min_accuracy) # (1) separate flnc reads into bins logging.info("Separating FLNC reads into bins.") tofu_f = TofuFiles(tofu_dir=args.tofu_dir) s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir, out_pickle=tofu_f.separate_flnc_pickle, bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer, bin_manual=args.bin_manual, max_base_limit_MB=args.max_base_limit_MB) s.run() flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files( tofu_f.separate_flnc_pickle) logging.info("Separated FLNC reads bins are %s", flnc_files) # (2) apply 'pbtranscript cluster' to each bin # run ICE/Quiver (the whole thing), providing the fasta_fofn logging.info("Running ICE/Polish on separated FLNC reads bins.") split_dirs = [] for flnc_file in flnc_files: split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out") mkdir(split_dir) split_dirs.append(split_dir) cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta") ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts) if op.exists(ipq_f.quivered_good_fq): logging.warning("HQ polished isoforms %s already exist. SKIP!", ipq_f.quivered_good_fq) continue else: logging.info("Running ICE/Quiver on %s", split_dir) rmpath(cur_out_cons) obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file, nfl_fa=args.nfl_fa, bas_fofn=args.bas_fofn, ccs_fofn=args.ccs_fofn, fasta_fofn=args.fasta_fofn, out_fa=cur_out_cons, sge_opts=sge_opts, ice_opts=ice_opts, ipq_opts=ipq_opts) if args.mem_debug: # DEBUG from memory_profiler import memory_usage start_t = time.time() mem_usage = memory_usage(obj.run, interval=60) end_t = time.time() with open('mem_debug.log', 'a') as f: f.write("Running ICE/Quiver on {0} took {1} secs.\n".format( split_dir, end_t - start_t)) f.write("Maximum memory usage: {0}\n".format(max(mem_usage))) f.write("Memory usage: {0}\n".format(mem_usage)) else: obj.run() if not args.keep_tmp_files: # by deafult, delete all tempory files. 
logging.info("Deleting %s", ipq_f.tmp_dir) subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir]) logging.info("Deleting %s", ipq_f.quivered_dir) subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir]) # (3) merge polished isoform cluster from all bins logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir) c = CombineRunner(combined_dir=tofu_f.combined_dir, sample_name=get_sample_name(args.sample_name), split_dirs=split_dirs, ipq_opts=ipq_opts) c.run() if args.summary_fn is not None: ln(tofu_f.all_cluster_summary_fn, args.summary_fn) if args.report_fn is not None: ln(tofu_f.all_cluster_report_fn, args.report_fn) # (4) map HQ isoforms to GMAP reference genome map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq, sam_filename=tofu_f.sorted_gmap_sam, gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name, gmap_nproc=args.gmap_nproc) # (5) post mapping to genome analysis, including # * collapse polished HQ isoform clusters into groups # * count abundance of collapsed isoform groups # * filter collapsed isoforms based on abundance info logging.info("Post mapping to genome analysis.") out_isoforms = args.collapsed_filtered_fn if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")): in_isoforms = tofu_f.all_hq_fa elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")): in_isoforms = tofu_f.all_hq_fq else: raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms) post_mapping_to_genome_runner(in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam, in_pickle=tofu_f.hq_lq_prefix_dict_pickle, out_isoforms=args.collapsed_filtered_fn, out_gff=args.gff_fn, out_abundance=args.abundance_fn, out_group=args.group_fn, out_read_stat=args.read_stat_fn, min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity, min_flnc_coverage=args.min_flnc_coverage, max_fuzzy_junction=args.max_fuzzy_junction, allow_extra_5exon=args.allow_extra_5exon, min_count=args.min_count) return 0
def __init__(self, reads_fn="test.fasta", out_dir="output/", out_reads_fn="testout.fasta", primer_fn=None, primer_report_fn=None, summary_fn=None, cpus=1, change_read_id=True, opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False), out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False, reuse_dom=False, ignore_empty_output=False): self.reads_fn = realpath(reads_fn) self.out_dir = realpath(out_dir) self.cpus = cpus self.change_read_id = change_read_id self.chimera_detection_opts = opts self.ignore_polyA = ignore_polyA self.reuse_dom = reuse_dom self.ignore_empty_output = ignore_empty_output self._numReads = None # The input primer file: primers.fasta self.primer_fn = primer_fn if primer_fn is not None else \ op.join(self.data_dir, PRIMERFN) # The output fasta file. self.out_all_reads_fn = realpath(out_reads_fn) # Intermediate output fasta file before chimera detection. # trimmed full-length reads: fl.trimmed.fasta # and # trimmed non-full-length reads: nfl.trimmed.fasta self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta") self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta") self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN) self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN) # The output primer file: primer_info.csv self.primer_report_fn = primer_report_fn \ if primer_report_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN # primer reports for nfl reads before chimera detection. Note that # chimera detection is not necessary for nfl reads. self._primer_report_nfl_fn = op.join(self.out_dir, "primer_report.nfl.csv") # primer reports for fl reads after chimera detection. Note that # chimera detection is required for fl reads. self._primer_report_fl_fn = op.join(self.out_dir, "primer_report.fl.csv") # The matrix file: PBMATRIX.txt self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN) # The output phmmer Dom file for trimming primers: hmmer.front_end.dom self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN) # The output phmmer Dom file for chimera detection: # hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN) self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN) self.chunked_front_back_reads_fns = None self.chunked_front_back_dom_fns = None #self.chunked_trimmed_reads_fns = None #self.chunked_trimmed_reads_dom_fns = None # The summary file: *.classify_summary.txt self.summary = ClassifySummary() self.summary_fn = summary_fn if summary_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + \ "." + CLASSIFYSUMMARY self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \ else op.join(self.out_dir, "nfl.fasta") self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta") self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta") self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \ else op.join(self.out_dir, "flnc.fasta") self.out_flc_fn = op.join(self.out_dir, "flc.fasta") for file_attr in ["out_nfl_fn", "out_nflnc_fn", "out_nflc_fn", "out_flnc_fn", "out_flc_fn", "out_all_reads_fn"]: file_name = fasta_file_name = getattr(self, file_attr) if file_name.endswith(".xml"): fasta_file_name = ".".join(file_name.split(".")[:-2])+".fasta" setattr(self, "%s_fasta" % file_attr, fasta_file_name)
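# Illustrative defaults derived above (hypothetical values): with
# out_reads_fn='movie.fasta', out_dir='output/' and no explicit report/summary
# paths, self.primer_report_fn becomes 'movie.' + PRIMERREPORTFN
# (primer_info.csv per the comment above), self.summary_fn becomes
# 'movie.' + CLASSIFYSUMMARY (classify_summary.txt), and
# self.out_nfl_fn / self.out_flnc_fn default to '<out_dir>/nfl.fasta' and
# '<out_dir>/flnc.fasta'. The final loop also maps any .xml output to a
# companion fasta attribute by dropping the last two extensions, e.g.
# out_flnc_fn='flnc.contigset.xml' yields self.out_flnc_fn_fasta == 'flnc.fasta'.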
def run(self): """ First, collapse input isoforms by calling Branch.run(). Then collapse fuzzy junctions by calling collapse_fuzzy_junctions. Finally, pick up representitive gff record for each group of collapsed isoforms. """ self.validate_inputs() logging.info("Collapsing isoforms into transcripts.") b = Branch(isoform_filename=self.isoform_filename, sam_filename=self.sam_filename, cov_threshold=self.min_flnc_coverage, min_aln_coverage=self.min_aln_coverage, min_aln_identity=self.min_aln_identity) b.run(allow_extra_5exon=self.allow_extra_5exon, skip_5_exon_alt=self.skip_5_exon_alt, ignored_ids_fn=self.ignored_ids_txt_fn, good_gff_fn=self.good_unfuzzy_gff_fn, bad_gff_fn=self.bad_unfuzzy_gff_fn, group_fn=self.unfuzzy_group_fn) logging.info("Good unfuzzy isoforms written to: %s", realpath(self.good_unfuzzy_gff_fn)) logging.info("Bad unfuzzy isoforms written to: %s", realpath(self.bad_unfuzzy_gff_fn)) logging.info("Unfuzzy isoform groups written to: %s", realpath(self.unfuzzy_group_fn)) if self.shall_collapse_fuzzy_junctions: logging.info("Further collapsing fuzzy junctions.") # need to further collapse those that have fuzzy junctions! collapse_fuzzy_junctions(gff_filename=self.good_unfuzzy_gff_fn, group_filename=self.unfuzzy_group_fn, fuzzy_gff_filename=self.good_fuzzy_gff_fn, fuzzy_group_filename=self.fuzzy_group_fn, allow_extra_5exon=self.allow_extra_5exon, max_fuzzy_junction=self.max_fuzzy_junction) logging.info("Good fuzzy isoforms written to: %s", realpath(self.good_fuzzy_gff_fn)) logging.info("Bad fuzzy isoforms written to: %s", realpath(self.bad_fuzzy_gff_fn)) logging.info("Fuzzy isoform groups written to: %s", realpath(self.fuzzy_group_fn)) ln(self.good_fuzzy_gff_fn, self.good_gff_fn) ln(self.good_fuzzy_gff_fn, self.gff_fn) ln(self.fuzzy_group_fn, self.group_fn) else: logging.info("No need to further collapse fuzzy junctions.") ln(self.good_unfuzzy_gff_fn, self.good_gff_fn) ln(self.good_unfuzzy_gff_fn, self.gff_fn) ln(self.unfuzzy_group_fn, self.group_fn) # Pick up representative logging.info("Picking up representative record.") pick_least_err_instead = not self.allow_extra_5exon # 5merge, pick longest pick_rep(isoform_filename=self.isoform_filename, gff_filename=self.good_gff_fn, group_filename=self.group_fn, output_filename=self.rep_fn(self.suffix), pick_least_err_instead=pick_least_err_instead, bad_gff_filename=self.bad_gff_fn) logging.info("Ignored IDs written to: %s", realpath(self.ignored_ids_txt_fn)) logging.info("Output GFF written to: %s", realpath(self.gff_fn)) logging.info("Output Group TXT written to: %s", realpath(self.group_fn)) logging.info("Output collapsed isoforms written to: %s", realpath(self.rep_fn(self.suffix))) logging.info("CollapseIsoforms Arguments: %s", self.arg_str())