Example 1
def combine_nfl_pickles(splitted_pickles, out_pickle):
    """Combine splitted nfl pickles to a big pickle."""
    logging.debug("Cominbing {N} nfl pickles: {ps} ".format(
        N=len(splitted_pickles), ps=",".join(splitted_pickles)) +
                  " into a big pickle {p}.".format(p=out_pickle))

    if len(splitted_pickles) == 1:
        logging.debug("Copying the only given pickle to out_pickle.")
        if realpath(splitted_pickles[0]) != realpath(out_pickle):
            shutil.copyfile(splitted_pickles[0], out_pickle)
    else:
        # Combine all partial outputs
        logging.debug("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in splitted_pickles:
            logging.debug("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        logging.debug("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
        logging.debug("{f} created.".format(f=out_pickle))
Example 2
def make_sane(args):
    """Sanitize input and output arguments."""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)

    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)

    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" % args.smrtlink_job_dir)

    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." % (args.gmap_db, args.gmap_name))

    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args
Example 3
def combine_nfl_pickles(splitted_pickles, out_pickle):
    """Combine splitted nfl pickles to a big pickle."""
    logging.debug("Cominbing {N} nfl pickles: {ps} ".
                  format(N=len(splitted_pickles),
                         ps=",".join(splitted_pickles)) +
                  " into a big pickle {p}.".format(p=out_pickle))

    if len(splitted_pickles) == 1:
        logging.debug("Copying the only given pickle to out_pickle.")
        if realpath(splitted_pickles[0]) != realpath(out_pickle):
            shutil.copyfile(splitted_pickles[0], out_pickle)
    else:
        # Combine all partial outputs
        logging.debug("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in splitted_pickles:
            logging.debug("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        logging.debug("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
        logging.debug("{f} created.".format(f=out_pickle))
Example 4
    def from_file(cls, cfg_fn):
        """read from a config file with
        SAMPLE=<name>;<path>

        GROUP_FILENAME=
        GFF_FILENAME=
        COUNT_FILENAME=
        """
        sample_names, sample_paths = [], []
        group_fn = gff_fn = abundance_fn = None
        for line in [line.strip() for line in open(realpath(cfg_fn), 'r')]:
            # read and process
            if line.startswith('SAMPLE='):
                name, path = line.strip()[7:].split(';')
                sample_names.append(name)
                sample_paths.append(realpath(path))
            elif line.startswith('GROUP_FILENAME='):
                group_fn = line.strip()[len('GROUP_FILENAME='):]
            elif line.startswith('GFF_FILENAME='):
                gff_fn = line.strip()[len('GFF_FILENAME='):]
            elif line.startswith('COUNT_FILENAME='):
                abundance_fn = line.strip()[len('COUNT_FILENAME='):]
        try:
            return ChainConfig(sample_names=sample_names, sample_paths=sample_paths,
                               group_fn=group_fn, gff_fn=gff_fn, abundance_fn=abundance_fn)
        except ValueError as e:
            raise ValueError("%s is an invalid ChainConfig file: %s" % (realpath(cfg_fn), str(e)))
Example 5
    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log(
                "Output directory {d} already exists.".format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException(
                "Consensus FASTA file {f} already exists.".format(f=out_fa))
        out_fa_dataset = None
        if out_fa.endswith(".contigset.xml"):
            out_fa_dataset = out_fa
            out_fa = re.sub(".contigset.xml", ".fasta", out_fa)
        return root_dir, out_fa, out_fa_dataset
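The last few lines above derive the FASTA name from a *.contigset.xml output name. A standalone sketch of that step; note that the original re.sub() pattern is unescaped and unanchored (a '.' there matches any character), while the version below escapes and anchors it.

import re

def split_output_names(out_fa):
    """Return (fasta_name, dataset_name_or_None) for a requested output path."""
    out_fa_dataset = None
    if out_fa.endswith(".contigset.xml"):
        out_fa_dataset = out_fa
        out_fa = re.sub(r"\.contigset\.xml$", ".fasta", out_fa)
    return out_fa, out_fa_dataset

# split_output_names("isoforms.contigset.xml") -> ("isoforms.fasta", "isoforms.contigset.xml")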
Example 6
def make_sane(args):
    """Sanitize input and output arguments."""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)

    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)

    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" %
                      args.smrtlink_job_dir)

    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." %
                      (args.gmap_db, args.gmap_name))

    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args
Example 7
    def _validate_outputs(self, _root_dir, _out_fa):
        """Validate outputs, create root_dir if it does not exist."""
        self.add_log("Checking outputs.", level=logging.INFO)
        root_dir, out_fa = _root_dir, _out_fa
        if root_dir is None:
            self.add_log("Output directory needs to be specified.",
                         level=logging.ERROR)
        if out_fa is None:
            self.add_log("Output consensus fasta needs to be specified.",
                         level=logging.ERROR)

        root_dir = realpath(root_dir)
        out_fa = realpath(out_fa)

        if op.exists(root_dir):
            self.add_log("Output directory {d} already exists.".
                         format(d=root_dir))
        else:
            self.add_log("Creating output directory {d}.".format(d=root_dir))
            os.mkdir(root_dir)
        if op.exists(out_fa):
            raise ClusterException("Consensus FASTA file {f} already exists.".
                                   format(f=out_fa))
        out_fa_dataset = None
        if out_fa.endswith(".contigset.xml"):
            out_fa_dataset = out_fa
            out_fa = re.sub(".contigset.xml", ".fasta", out_fa)
        return root_dir, out_fa, out_fa_dataset
Example 8
    def from_file(cls, cfg_fn):
        """read from a config file with
        SAMPLE=<name>;<path>

        GROUP_FILENAME=
        GFF_FILENAME=
        COUNT_FILENAME=
        """
        sample_names, sample_paths = [], []
        group_fn = gff_fn = abundance_fn = None
        for line in [line.strip() for line in open(realpath(cfg_fn), 'r')]:
            # read and process
            if line.startswith('SAMPLE='):
                name, path = line.strip()[7:].split(';')
                sample_names.append(name)
                sample_paths.append(realpath(path))
            elif line.startswith('GROUP_FILENAME='):
                group_fn = line.strip()[len('GROUP_FILENAME='):]
            elif line.startswith('GFF_FILENAME='):
                gff_fn = line.strip()[len('GFF_FILENAME='):]
            elif line.startswith('COUNT_FILENAME='):
                abundance_fn = line.strip()[len('COUNT_FILENAME='):]
        try:
            return ChainConfig(sample_names=sample_names,
                               sample_paths=sample_paths,
                               group_fn=group_fn,
                               gff_fn=gff_fn,
                               abundance_fn=abundance_fn)
        except ValueError as e:
            raise ValueError("%s is an invalid ChainConfig file: %s" %
                             (realpath(cfg_fn), str(e)))
Example 9
    def __init__(self,
                 root_dir,
                 flnc_fa,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 out_fa,
                 sge_opts,
                 ice_opts,
                 ipq_opts,
                 report_fn=None,
                 summary_fn=None,
                 fasta_fofn=None,
                 output_pickle_file=None,
                 tmp_dir=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn,
                                      fasta_fofn=fasta_fofn,
                                      tmp_dir=tmp_dir)

        self.sge_opts = sge_opts  # SGE, CPU arguments and etc
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments

        self.output_pickle_file = output_pickle_file
        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = \
            self._validate_inputs(_flnc_fa=flnc_fa, _nfl_fa=nfl_fa,
                                  _ccs_fofn=ccs_fofn,
                                  _fasta_fofn=fasta_fofn,
                                  quiver=self.ice_opts.quiver)

        self.root_dir, self.out_fa, self.out_fa_dataset = \
            self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None  # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()  # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.add_log("Setting ece_penalty: {0} ece_min_len: {1}".format(ice_opts.ece_penalty, ice_opts.ece_min_len),\
                     level=logging.INFO)

        self.report_fn = realpath(report_fn) if report_fn is not None \
            else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
            else op.join(self.root_dir, "cluster_summary.txt")

        self.add_log("A Cluster Object created.", level=logging.INFO)
Example 10
    def _validate_inputs(self, fasta_filenames, ref_fasta):
        """Validate input files."""
        for f in fasta_filenames:
            if not op.exists(f):
                raise IOError("Input fasta {f} does not exist.".format(f=f))
        if ref_fasta is None or not op.exists(ref_fasta):
            raise IOError("Reference {r} does not exist.".format(r=ref_fasta))

        return ([realpath(f) for f in fasta_filenames], realpath(ref_fasta))
Example 11
    def __init__(self,
                 query_filename,
                 target_filename,
                 is_FL,
                 same_strand_only,
                 query_converted=False,
                 target_converted=False,
                 dazz_dir=None,
                 script_dir="scripts/",
                 use_sge=False,
                 sge_opts=None,
                 cpus=24):
        """
        Parameters:
          query_filename - query FASTA file
          target_filename - target FASTA file
          is_FL - whether or not reads are FLNC CCS reads
          same_strand_only - whether or not to align reads in the reverse strand
          query_converted - whether or not the query FASTA file has been converted
                            to a daligner-compatible FASTA file.
          target_converted - whether or not the target FASTA file has been converted
                             to a daligner-compatible FASTA file.
          dazz_dir - if None, all query.dazz.* files will be saved in the same
                     directory as query and all target.dazz.* files will be
                     saved in the same dir as target.
                     if a valid path, all query.dazz.* files and target.dazz.*
                     files will be saved to dazz_dir.
          script_dir - directory for saving all scripts

          use_sge - submit daligner jobs to sge or run them locally?
          sge_opts - sge options
          cpus - total number of cpus that can be used to align query to target.
        """
        self.query_filename = realpath(query_filename)
        self.target_filename = realpath(target_filename)
        self.is_FL = is_FL
        self.same_strand_only = same_strand_only
        self.cpus = cpus
        self.dazz_dir = dazz_dir
        self.script_dir = realpath(script_dir)
        self.output_dir = ""

        self.query_dazz_handler = DazzIDHandler(self.query_filename,
                                                converted=query_converted,
                                                dazz_dir=dazz_dir)
        # target may have already been converted (if shared)
        target_converted = (target_converted
                            or self.query_filename == self.target_filename)
        self.target_dazz_handler = DazzIDHandler(self.target_filename,
                                                 converted=target_converted,
                                                 dazz_dir=dazz_dir)

        self.target_blocks = self.target_dazz_handler.num_blocks
        self.query_blocks = self.query_dazz_handler.num_blocks

        self.use_sge = use_sge
        self.sge_opts = sge_opts
Example 12
    def _validate_inputs(self, fasta_filenames, ref_fasta):
        """Validate input files."""
        for f in fasta_filenames:
            if not op.exists(f):
                raise IOError("Input fasta {f} does not exist.".format(f=f))
        if ref_fasta is None or not op.exists(ref_fasta):
            raise IOError("Reference {r} does not exist.".format(r=ref_fasta))

        return ([realpath(f) for f in fasta_filenames],
                realpath(ref_fasta))
Example 13
    def __init__(self, query_filename, target_filename,
                 is_FL, same_strand_only,
                 query_converted=False, target_converted=False,
                 dazz_dir=None, script_dir="scripts/",
                 use_sge=False, sge_opts=None, cpus=24):
        """
        Parameters:
          query_filename - query FASTA file
          target_filename - target FASTA file
          is_FL - whether or not reads are FLNC CCS reads
          same_strand_only - whether or not to align reads in the reverse strand
          query_converted - whether or not the query FASTA file has been converted
                            to a daligner-compatible FASTA file.
          target_converted - whether or not the target FASTA file has been converted
                             to a daligner-compatible FASTA file.
          dazz_dir - if None, all query.dazz.* files will be saved in the same
                     directory as query and all target.dazz.* files will be
                     saved in the same dir as target.
                     if a valid path, all query.dazz.* files and target.dazz.*
                     files will be saved to dazz_dir.
          script_dir - directory for saving all scripts

          use_sge - submit daligner jobs to sge or run them locally?
          sge_opts - sge options
          cpus - total number of cpus that can be used to align query to target.
        """
        self.query_filename = realpath(query_filename)
        self.target_filename = realpath(target_filename)
        self.is_FL = is_FL
        self.same_strand_only = same_strand_only
        self.cpus = cpus
        self.dazz_dir = dazz_dir
        self.script_dir = realpath(script_dir)
        self.output_dir = ""

        self.query_dazz_handler = DazzIDHandler(self.query_filename,
                                                converted=query_converted,
                                                dazz_dir=dazz_dir)
        # target may have already been converted (if shared)
        target_converted = (target_converted or
                            self.query_filename == self.target_filename)
        self.target_dazz_handler = DazzIDHandler(self.target_filename,
                                                 converted=target_converted,
                                                 dazz_dir=dazz_dir)

        self.target_blocks = self.target_dazz_handler.num_blocks
        self.query_blocks = self.query_dazz_handler.num_blocks

        self.use_sge = use_sge
        self.sge_opts = sge_opts
Example 14
    def __init__(self, root_dir, flnc_fa, nfl_fa,
                 bas_fofn, ccs_fofn, out_fa,
                 sge_opts, ice_opts, ipq_opts,
                 report_fn=None, summary_fn=None,
                 fasta_fofn=None, output_pickle_file=None,
                 tmp_dir=None):
        super(Cluster, self).__init__(prog_name="Cluster",
                                      root_dir=root_dir,
                                      bas_fofn=bas_fofn,
                                      ccs_fofn=ccs_fofn,
                                      fasta_fofn=fasta_fofn,
                                      tmp_dir=tmp_dir)

        self.sge_opts = sge_opts  # SGE, CPU arguments and etc
        self.ice_opts = ice_opts  # ICE clustering algorithm arguments
        self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments

        self.output_pickle_file = output_pickle_file
        self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = \
            self._validate_inputs(_flnc_fa=flnc_fa, _nfl_fa=nfl_fa,
                                  _ccs_fofn=ccs_fofn,
                                  _fasta_fofn=fasta_fofn,
                                  quiver=self.ice_opts.quiver)

        self.root_dir, self.out_fa, self.out_fa_dataset = \
            self._validate_outputs(root_dir, out_fa)

        self.sanity_check()

        self._probqv = None     # probability & quality value

        self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
        self._nflncSplittedFas = []  # split nfl_fa into smaller files.
        self._logConfigs()      # Log configurations

        self.iceinit = None
        self.icec = None
        self.iceq = None
        self.pol = None

        self.add_log("Setting ece_penalty: {0} ece_min_len: {1}".format(ice_opts.ece_penalty, ice_opts.ece_min_len),\
                     level=logging.INFO)

        self.report_fn = realpath(report_fn) if report_fn is not None \
            else op.join(self.root_dir, "cluster_report.csv")
        self.summary_fn = realpath(summary_fn) if summary_fn is not None \
            else op.join(self.root_dir, "cluster_summary.txt")

        self.add_log("A Cluster Object created.", level=logging.INFO)
Example 15
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa,
                            target_filename=testInFa,
                            is_FL=True,
                            same_strand_only=True,
                            query_converted=False,
                            target_converted=False,
                            use_sge=False,
                            cpus=4,
                            sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
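A generic sketch of the same sanity-check idea: run a tool on throwaway input inside a temporary directory and only verify that it exits cleanly. The command below is a placeholder, not the daligner invocation that DalignerRunner builds.

import shutil
import subprocess
import tempfile

def sanity_check_tool(cmd):
    """Return True if `cmd` (a list of argv strings) runs successfully."""
    tmp = tempfile.mkdtemp(prefix="sanity_")
    try:
        subprocess.check_call(cmd, cwd=tmp,
                              stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    except (OSError, subprocess.CalledProcessError):
        return False
    finally:
        shutil.rmtree(tmp)

# e.g. sanity_check_tool(["some_aligner", "--version"])  # placeholder command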
Example 16
    def __init__(self, input_filename, converted=False, dazz_dir=None):
        """
        input_filename - input FASTA/FASTQ/ContigSet file
        converted - whether or not input file has been converted to
                    daligner compatible FASTA file.
        dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
                  in the same directory as the input file.
                  if a valid path, save all output files to dazz_dir.
        """
        self.dazz_dir = dazz_dir
        self.input_filename = realpath(input_filename)
        self.validate_file_type(self.input_filename)

        # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
        self.dazz_mapping = {}

        if converted and not nfs_exists(self.db_filename):
            log.warning(
                str(self.input_filename) +
                " should have been converted to daligner-compatible" +
                " format, but in fact it is not. Converting ...")
            converted = False

        if not converted:
            self.convert_to_dazz_fasta()
            self.make_db()
        else:
            self.read_dazz_pickle()
Example 17
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts, fasta_fofn=None,
                 tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Example 18
    def __init__(self, input_filename, converted=False, dazz_dir=None):
        """
        input_filename - input FASTA/FASTQ/ContigSet file
        converted - whether or not input file has been converted to
                    daligner compatible FASTA file.
        dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
                  in the same directory as the input file.
                  if a valid path, save all output files to dazz_dir.
        """
        self.dazz_dir = dazz_dir
        self.input_filename = realpath(input_filename)
        self.validate_file_type(self.input_filename)

        # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
        self.dazz_mapping = {}

        if converted and not nfs_exists(self.db_filename):
            log.warning(str(self.input_filename) +
                        " should have been converted to daligner-compatible" +
                        " format, but in fact it is not. Converting ...")
            converted = False

        if not converted:
            self.convert_to_dazz_fasta()
            self.make_db()
        else:
            self.read_dazz_pickle()
Example 19
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa,
                            target_filename=testInFa,
                            is_FL=True, same_strand_only=True,
                            query_converted=False, target_converted=False,
                            use_sge=False, cpus=4, sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
Example 20
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exists" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = [
        'cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
        'ls *.iit *meta', 'sleep 3',
        'cd %s' % real_upath(cwd)
    ]
    execute(' && '.join(cmd_args))

    cmd_args = [
        'gmap',
        '-D {d}'.format(d=real_upath(gmap_db_dir)),
        '-d {name}'.format(name=gmap_db_name),
        '-t {nproc}'.format(nproc=gmap_nproc),
        '-n 0',
        '-z sense_force',
        '--cross-species',
        '-f samse',
        '--max-intronlength-ends 200000',  # for long genes
        real_upath(gmap_input_filename),
        '>',
        real_upath(unsorted_sam_filename),
        '2>{log}'.format(log=real_upath(log_filename))
    ]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
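A sketch of issuing the same gmap command through subprocess instead of a concatenated shell string; the flags are the ones shown above, while paths and the helper name are hypothetical.

import subprocess

def run_gmap(query_fn, sam_fn, log_fn, gmap_db_dir, gmap_db_name, nproc):
    """Run gmap on query_fn, writing SAM to sam_fn and stderr to log_fn."""
    cmd = ["gmap", "-D", gmap_db_dir, "-d", gmap_db_name, "-t", str(nproc),
           "-n", "0", "-z", "sense_force", "--cross-species", "-f", "samse",
           "--max-intronlength-ends", "200000",  # for long genes, as above
           query_fn]
    with open(sam_fn, "w") as sam_out, open(log_fn, "w") as log_out:
        subprocess.check_call(cmd, stdout=sam_out, stderr=log_out)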
Example 21
    def run(self):
        """Run"""
        writer = open(self.output_analysis_fn, 'w')
        writer.write("#isoseq_output_fn = %s\n" % realpath(self.isoseq_output_fn))
        writer.write("#reference_transcripts_fn = %s\n" % realpath(self.reference_transcripts_fn))

        writer.write("#total_num_isoforms = %s\n" % self.n_isoforms)
        writer.write("#total_num_reference_transcripts = %s\n" % self.n_refs)

        writer.write("#num_true_positive = %s\n" % self.n_true_positive)
        writer.write("#num_false_positive = %s\n" % self.n_false_positive)

        for ref in self.reference_transcripts:
            is_detected = ref.name in self.refs_detected
            writer.write("%s\t%s\t%s\n" % (ref.name, len(ref.sequence),
                                           'DETECTED' if is_detected else 'MISSED'))
        writer.close()
Example 22
    def _validate_inputs(self,
                         _flnc_fa,
                         _nfl_fa,
                         _ccs_fofn,
                         _fasta_fofn=None,
                         quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa = _flnc_fa, _nfl_fa
        ccs_fofn, fasta_fofn = _ccs_fofn, _fasta_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterException(
                "Input full-length non-chimeric reads " +
                "files (i.e., flnc_fa) needs to be specified.")
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterException(
                    "Input non-full-length reads file (i.e., nfl_fa)" +
                    " needs to be specified for isoform polish.")
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " +
                                       "non-chimeric reads: {fn}".format(
                                           fn=nfl_fa))

        if ccs_fofn is not None:
            try:
                ccs_fofn = validate_fofn(ccs_fofn)
            except IOError as e:
                raise ClusterException(str(e))

        if fasta_fofn is not None and quiver:
            try:
                fasta_fofn = validate_fofn(fasta_fofn)
            except IOError as e:
                raise ClusterException(str(e))

        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
Example 23
    def run(self):
        """Run"""
        writer = open(self.output_analysis_fn, 'w')
        writer.write("#isoseq_output_fn = %s\n" %
                     realpath(self.isoseq_output_fn))
        writer.write("#reference_transcripts_fn = %s\n" %
                     realpath(self.reference_transcripts_fn))

        writer.write("#total_num_isoforms = %s\n" % self.n_isoforms)
        writer.write("#total_num_reference_transcripts = %s\n" % self.n_refs)

        writer.write("#num_true_positive = %s\n" % self.n_true_positive)
        writer.write("#num_false_positive = %s\n" % self.n_false_positive)

        for ref in self.reference_transcripts:
            is_detected = ref.name in self.refs_detected
            writer.write("%s\t%s\t%s\n" % (ref.name, len(
                ref.sequence), 'DETECTED' if is_detected else 'MISSED'))
        writer.close()
Example 24
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename+'.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exists" % gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % op.join(gmap_db_dir, gmap_db_name),
                'ls *.iit *meta', 'sleep 3', 'cd %s' % cwd]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap', '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
Example 25
    def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn, _fasta_fofn=None,
                         quiver=False):
        """Validate input files and return absolute expaneded paths."""
        flnc_fa, nfl_fa = _flnc_fa, _nfl_fa
        ccs_fofn, fasta_fofn = _ccs_fofn, _fasta_fofn
        self.add_log("Checking input files.", level=logging.INFO)
        if flnc_fa is None:
            raise ClusterException("Input full-length non-chimeric reads " +
                                   "files (i.e., flnc_fa) needs to be specified.")
        else:
            flnc_fa = realpath(flnc_fa)
            if not op.exists(flnc_fa):
                raise ClusterException("Unable to find full-length " +
                                       "non-chimeric reads: {fn}".format(fn=flnc_fa))

        if nfl_fa is None:
            if quiver is True:
                raise ClusterException("Input non-full-length reads file (i.e., nfl_fa)" +
                                       " needs to be specified for isoform polish.")
        else:
            nfl_fa = realpath(nfl_fa)
            if not op.exists(nfl_fa):
                raise ClusterException("Unable to find non-full-length " +
                                       "non-chimeric reads: {fn}".format(fn=nfl_fa))

        if ccs_fofn is not None:
            try:
                ccs_fofn = validate_fofn(ccs_fofn)
            except IOError as e:
                raise ClusterException(str(e))

        if fasta_fofn is not None and quiver:
            try:
                fasta_fofn = validate_fofn(fasta_fofn)
            except IOError as e:
                raise ClusterException(str(e))

        return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
Example 26
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 ipq_opts,
                 fasta_fofn=None,
                 tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn,
                          tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None  # IceAllPartials.
        self.iceq = None  # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()
Example 27
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    cmd = " ".join([
        gcon_py,
        real_upath(testInFa),
        "{testDir}/g_consensus".format(testDir=real_upath(testDir)), "c1"
    ])
    write_cmd_to_script(cmd=cmd, script=testSh)

    assert op.exists(testSh)
    cmd = sge_opts.qsub_cmd(script=real_upath(testSh),
                            num_threads=1,
                            wait_before_exit=True)

    logging.debug("Submitting cmd: " + cmd)
    backticks(cmd)

    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True
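One note on the comparison above: filecmp.cmp() defaults to shallow=True, which treats files with identical os.stat() signatures as equal without reading them; a byte-by-byte check looks like this (file names are placeholders).

import filecmp

same = filecmp.cmp("g_consensus.fasta", "expected.fasta", shallow=False)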
Example 28
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    cmd = " ".join([gcon_py, real_upath(testInFa),
                    "{testDir}/g_consensus".format(testDir=real_upath(testDir)),
                    "c1"])
    write_cmd_to_script(cmd=cmd, script=testSh)

    assert op.exists(testSh)
    cmd = sge_opts.qsub_cmd(script=real_upath(testSh),
                            num_threads=1, wait_before_exit=True)

    logging.debug("Submitting cmd: " + cmd)
    backticks(cmd)

    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True
Example 29
    def __init__(self, flnc_filename, root_dir, out_pickle, output_basename):
        """
        Reads in the input flnc file will be separated into multiple categories
        according to the separation criteria, and reads in each category will
        be written to
            <root_dir>/<separation_criteria>/<output_basename>.fasta|contigset.xml

        e.g., if reads are separated by primers, then reads will be written to
        <root_dir>/<primer*>/<output_basename>.fasta|contigset.xml

        Parameters:
          flnc_filename - input full length non-chimeric reads in FASTA or CONTIGSET
          root_dir - output root directory
          output_basename - output file basename
        """
        self.flnc_filename = flnc_filename
        self.root_dir = realpath(root_dir)
        mkdir(root_dir)
        self.output_basename = output_basename
        self.create_contigset = True if flnc_filename.endswith(".xml") else False
        self.handles = {} # key --> fasta file handler
        self.out_pickle = out_pickle if out_pickle is not None \
                          else op.join(self.root_dir, "separate_flnc.pickle")
Example 30
    def __init__(self, flnc_filename, root_dir, out_pickle, output_basename):
        """
        Reads in the input flnc file will be separated into multiple categories
        according to the separation criteria, and reads in each category will
        be written to
            <root_dir>/<separation_criteria>/<output_basename>.fasta|contigset.xml

        e.g., if reads are separated by primers, then reads will be written to
        <root_dir>/<primer*>/<output_basename>.fasta|contigset.xml

        Parameters:
          flnc_filename - input full length non-chimeric reads in FASTA or CONTIGSET
          root_dir - output root directory
          output_basename - output file basename
        """
        self.flnc_filename = flnc_filename
        self.root_dir = realpath(root_dir)
        mkdir(root_dir)
        self.output_basename = output_basename
        self.create_contigset = True if flnc_filename.endswith(".xml") else False
        self.handles = {} # key --> fasta file handler
        self.out_pickle = out_pickle if out_pickle is not None \
                          else op.join(self.root_dir, "separate_flnc.pickle")
Example 31
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!", ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons, sge_opts=sge_opts,
                      ice_opts=ice_opts, ipq_opts=ipq_opts)

        if args.mem_debug: # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(split_dir,
                                                                            end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files: # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq, sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle, out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage, max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon, min_count=args.min_count)

    return 0
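The cleanup step above shells out to `rm -rf` via subprocess.Popen without waiting for it to finish; a standard-library equivalent that runs synchronously would be (directory name is a placeholder):

import shutil

shutil.rmtree("cluster_out/tmp", ignore_errors=True)  # rm -rf equivalent, but blocking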
Example 32
    def __init__(self,
                 reads_fn="test.fasta",
                 out_dir="output/",
                 out_reads_fn="testout.fasta",
                 primer_fn=None,
                 primer_report_fn=None,
                 summary_fn=None,
                 cpus=1,
                 change_read_id=True,
                 opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False),
                 out_nfl_fn=None,
                 out_flnc_fn=None,
                 ignore_polyA=False,
                 reuse_dom=False,
                 ignore_empty_output=False):
        self.reads_fn = realpath(reads_fn)
        self.out_dir = realpath(out_dir)
        self.cpus = cpus
        self.change_read_id = change_read_id
        self.chimera_detection_opts = opts
        self.ignore_polyA = ignore_polyA
        self.reuse_dom = reuse_dom
        self.ignore_empty_output = ignore_empty_output
        self._numReads = None

        # The input primer file: primers.fasta
        self.primer_fn = primer_fn if primer_fn is not None else \
            op.join(self.data_dir, PRIMERFN)
        # The output fasta file.
        self.out_all_reads_fn = realpath(out_reads_fn)

        # Intermediate output fasta file before chimera detection.
        #     trimmed full-length reads: fl.trimmed.fasta
        # and
        #     trimmed non-full-length reads: nfl.trimmed.fasta
        self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")
        self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta")

        self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
        self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

        # The output primer file: primer_info.csv
        self.primer_report_fn = primer_report_fn \
            if primer_report_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
        # primer reports for nfl reads before chimera detection. Note that
        # chimera detection is not necessary for nfl reads.
        self._primer_report_nfl_fn = op.join(self.out_dir,
                                             "primer_report.nfl.csv")
        # primer reports for fl reads after chimera detection. Note that
        # chimera detection is required for fl reads.
        self._primer_report_fl_fn = op.join(self.out_dir,
                                            "primer_report.fl.csv")

        # The matrix file: PBMATRIX.txt
        self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

        # The output phmmer Dom file for trimming primers: hmmer.front_end.dom
        self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
        # The output phmmer Dom file for chimera detection:
        #     hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom
        self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN)
        self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN)

        self.chunked_front_back_reads_fns = None
        self.chunked_front_back_dom_fns = None

        #self.chunked_trimmed_reads_fns = None
        #self.chunked_trimmed_reads_dom_fns = None

        # The summary file: *.classify_summary.txt
        self.summary = ClassifySummary()
        self.summary_fn = summary_fn if summary_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + \
            "." + CLASSIFYSUMMARY

        self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
            else op.join(self.out_dir, "nfl.fasta")
        self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta")
        self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta")

        self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
            else op.join(self.out_dir, "flnc.fasta")
        self.out_flc_fn = op.join(self.out_dir, "flc.fasta")

        for file_attr in [
                "out_nfl_fn", "out_nflnc_fn", "out_nflc_fn", "out_flnc_fn",
                "out_flc_fn", "out_all_reads_fn"
        ]:
            file_name = fasta_file_name = getattr(self, file_attr)
            if file_name.endswith(".xml"):
                fasta_file_name = ".".join(
                    file_name.split(".")[:-2]) + ".fasta"
            setattr(self, "%s_fasta" % file_attr, fasta_file_name)
Example 33
    def __init__(self, combined_dir):
        self.combined_dir = realpath(combined_dir)
        mkdir(self.combined_dir)
Example 34
def post_mapping_to_genome_runner(
        in_isoforms,
        in_sam,
        in_pickle,
        out_isoforms,
        out_gff,
        out_abundance,
        out_group,
        out_read_stat,
        min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
        min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
        min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
        max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
        allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
        skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
        min_count=fci.Constants.MIN_COUNT_DEFAULT,
        to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    """
    log.info('args: {!r}'.format(locals()))
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError(
            "Format of input and output isoforms %s, %s must be the same." %
            (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input and output isoforms %s, %s must be FASTA or FASTQ."
            % (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn,
                     pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn,
                    in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn,
                    in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) ln output files
    ln_pairs = [
        (fff.filtered_rep_fn(out_suffix), out_isoforms),  # rep isoforms
        (fff.filtered_gff_fn, out_gff),  # gff annotation
        (fff.filtered_abundance_fn, out_abundance),  # abundance info
        (fff.group_fn, out_group),  # groups
        (fff.read_stat_fn, out_read_stat)
    ]  # read stat info
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, filter_out_subsets)
    logging.info(
        "Collapsed and filtered isoform sequences written to %s",
        realpath(out_isoforms) if out_isoforms is not None else realpath(
            fff.filtered_rep_fn(out_suffix)))
    logging.info(
        "Collapsed and filtered isoform annotations written to %s",
        realpath(out_gff)
        if out_gff is not None else realpath(fff.filtered_gff_fn))
    logging.info(
        "Collapsed and filtered isoform abundance info written to %s",
        realpath(out_abundance)
        if out_abundance is not None else realpath(fff.filtered_abundance_fn))
    logging.info(
        "Collapsed isoform groups written to %s",
        realpath(out_group)
        if out_group is not None else realpath(fff.group_fn))
    logging.info(
        "Read status of FL and nFL reads written to %s",
        realpath(out_read_stat)
        if out_read_stat is not None else realpath(fff.read_stat_fn))
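A minimal usage sketch of the runner above, with placeholder file names (these exact paths are not produced elsewhere on this page) and all thresholds left at their defaults; the function itself is assumed to be importable from the surrounding module:

# Hypothetical invocation of post_mapping_to_genome_runner; paths are placeholders.
# Input and output isoforms must share the same suffix (fasta/fastq), as checked above.
post_mapping_to_genome_runner(
    in_isoforms="hq_isoforms.fastq",            # polished HQ isoforms
    in_sam="sorted_gmap.sam",                   # isoforms mapped to the genome, sorted
    in_pickle="hq_lq_prefix_dict.pickle",       # HQ/LQ prefix dict pickle from clustering
    out_isoforms="collapsed.filtered.rep.fastq",
    out_gff="collapsed.filtered.gff",
    out_abundance="collapsed.filtered.abundance.txt",
    out_group="collapsed.group.txt",
    out_read_stat="collapsed.read_stat.txt")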
Example n. 35
def build_uc_from_partial_daligner(input_fasta,
                                   ref_fasta,
                                   out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything locally!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False,
                            same_strand_only=False,
                            query_converted=False,
                            target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False,
                            sge_opts=None,
                            cpus=cpus)
    runner.run(min_match_len=300,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref; it won't actually be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs", ccs_fofn,
                             input_fasta,
                             time.time() - start_t)
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s", input_fasta,
                             ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs", input_fastq,
                             time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False,
            sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1,
            ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec", la4ice_filename,
                     str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
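The pickle written above maps each cluster id to the nFL read ids assigned to it, plus a nohit set; a small sketch (hypothetical path, cPickle assumed for the bare load/dump calls used above) of reading it back:

# Inspect the nFL partial_uc pickle written by the function above.
from cPickle import load   # the code above uses bare load/dump; cPickle is assumed

with open("nfl_uc.pickle") as f:   # hypothetical output path
    uc = load(f)

partial_uc = uc['partial_uc']      # {cluster_id: [read_id, ...]}
nohit = uc['nohit']                # set of read ids with no hit to any isoform
print "%d clusters, %d reads without a hit" % (len(partial_uc), len(nohit))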
Example n. 36
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename,
                                   ice_opts,
                                   probqv,
                                   qv_prob_threshold=0.3,
                                   cpus=4,
                                   no_qv_or_aln_checking=False,
                                   tmp_dir=None,
                                   sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything locally!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir, script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len, output_dir=output_dir, sensitive_mode=ice_opts.sensitive_mode)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()


        # aligning nFLs (is_FL=False), so it is OK for reads to align only partially
        hitItems = daligner_against_ref2(query_dazz_handler=runner.query_dazz_handler,
                                        target_dazz_handler=runner.target_dazz_handler,
                                        la4ice_filename=la4ice_filename,
                                        is_FL=False, sID_starts_with_c=sID_starts_with_c,
                                        qver_get_func=probqv.get_smoothed,
                                        qvmean_get_func=probqv.get_mean,
                                        qv_prob_threshold=qv_prob_threshold,
                                        ece_penalty=ice_opts.ece_penalty,
                                        ece_min_len=ice_opts.ece_min_len,
                                        same_strand_only=True,
                                        no_qv_or_aln_checking=no_qv_or_aln_checking,
                                        max_missed_start=ice_opts.max_missed_start,
                                        max_missed_end=ice_opts.max_missed_end,
                                        full_missed_start=ice_opts.full_missed_start,
                                        full_missed_end=ice_opts.full_missed_end)


        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
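Unlike the previous example, this variant expects the caller to supply ice_opts and probqv instead of building them internally. A hedged sketch of such a call, assuming IceOptions() provides defaults for min_match_len, the ece_* values and the missed-start/end fields consumed above (paths are placeholders):

# Hypothetical call; IceOptions() is assumed to default min_match_len, ece_penalty,
# ece_min_len and the max/full_missed_* fields used above.
ice_opts = IceOptions()
probqv = ProbFromModel(.01, .07, .06)   # dummy QV model, as in the no-QV branch above

build_uc_from_partial_daligner(input_fasta="nfl.fasta",
                               ref_fasta="consensus_isoforms.fasta",
                               out_pickle="nfl_uc.pickle",
                               done_filename=None,
                               ice_opts=ice_opts,
                               probqv=probqv,
                               cpus=4,
                               tmp_dir="/tmp")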
Example n. 37
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename,
                                ice_opts,
                                probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)


    logging.info("Calling blasr_against_ref ...")

    # aligning nFLs (is_FL = False), so partial alignments are acceptable
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=sID_starts_with_c,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 qv_prob_threshold=qv_prob_threshold,
                                 ece_penalty=ice_opts.ece_penalty,
                                 ece_min_len=ice_opts.ece_min_len,
                                 max_missed_start=ice_opts.max_missed_start,
                                 max_missed_end=ice_opts.max_missed_end,
                                 full_missed_start=ice_opts.full_missed_start,
                                 full_missed_end=ice_opts.full_missed_end,
                                 same_strand_only=False)


    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
    def __init__(self, root_dir):
        self.root_dir = realpath(root_dir)
Example n. 39
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything locally!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir, script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir, sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref; it won't actually be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs",
                             ccs_fofn, input_fasta, time.time()-start_t)
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s",
                             input_fasta, ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs",
                             input_fastq, time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(query_dazz_handler=runner.query_dazz_handler,
                                        target_dazz_handler=runner.target_dazz_handler,
                                        la4ice_filename=la4ice_filename,
                                        is_FL=False,
                                        sID_starts_with_c=True,
                                        qver_get_func=probqv.get_smoothed,
                                        qvmean_get_func=probqv.get_mean,
                                        ece_penalty=1,
                                        ece_min_len=20,
                                        same_strand_only=False,
                                        no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
Example n. 40
    def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
        """
        if self.use_sge --- writes to <scripts>/daligner_job_#.sh
        else --- run locally, dividing into self.cpus/4 tasks (capped max at 4)

        NOTE 1: when using SGE, be careful that multiple calls to this might
        end up writing to the SAME job.sh files, this should be avoided by
        changing <scripts> directory

        NOTE 2: more commonly this should be invoked locally
        (since ice_partial.py itself is typically qsub-ed);
        in that case it is recommended to keep self.cpus = 4, and the
        original qsub job should have been submitted with qsub -pe smp 4
        (set by --blasr_nproc 4). This way the daligner jobs run
        consecutively, while LA4Ice is parallelized 4X.
        """
        self.output_dir = realpath(output_dir) # Reset output_dir
        old_dir = realpath(op.curdir)
        mkdir(output_dir)
        os.chdir(output_dir)

        if self.use_sge:
            mknewdir(self.script_dir)

        # prepare done scripts is no longer necessary.
        #self.write_daligner_done_script()
        #self.write_la4ice_done_script()

        # (a) run all daligner jobs
        daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                           sensitive_mode=sensitive_mode)

        logging.info("Start daligner cmds " +
                     ("using sge." if self.use_sge else "locally."))
        logging.debug("CMD: " + "\n".join(daligner_cmds))

        start_t = time.time()
        failed = []
        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=daligner_cmds,
                               script_files=self.daligner_scripts,
                               #done_script=self.daligner_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=daligner_cmds,
                                 num_threads=max(1, min(self.cpus/4, 4))))
        logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.")

        # (b) run all LA4Ice jobs
        start_t = time.time()
        logging.info("Start LA4Ice cmds " +
                     ("using sge." if self.use_sge else "locally."))
        la4ice_cmds = self.la4ice_cmds
        logging.debug("CMD: " + "\n".join(la4ice_cmds))

        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=la4ice_cmds,
                               script_files=self.la4ice_scripts,
                               #done_script=self.la4ice_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=la4ice_cmds,
                                 num_threads=max(1, min(self.cpus, 4))))
        logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.")
        os.chdir(old_dir)

        if len(failed) == 0:
            return 0
        else:
            raise RuntimeError("%s.run failed, %s." %
                               (self.__class__.__name__,
                                "\n".join([x[0] for x in failed])))
Example n. 41
def args_runner(args):
    """Run given input args, e.g.,
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2 --no_filter_subsets
    """
    in_fq, out_fq = args.in_rep_fastq, args.out_rep_fastq

    def _get_prefix_of_rep_fq(fn):
        """Return prefix of *.rep.fq"""
        if fn.endswith(".rep.fastq") or fn.endswith(".rep.fq"):
            return '.'.join(fn.split(".")[0:-2])
        elif fn.endswith(".fastq") or fn.endswith(".fq"):
            return '.'.join(fn.split(".")[0:-1])
        raise ValueError("Invalid collapsed isoforms .rep.fastq file %s" % fn)

    input_prefix = _get_prefix_of_rep_fq(in_fq)
    output_prefix = _get_prefix_of_rep_fq(out_fq)

    # infer group.txt, abundance.txt and gff
    in_group_filename = input_prefix + ".group.txt"
    in_abundance_filename = input_prefix + ".abundance.txt"
    in_gff_filename = input_prefix + ".gff"

    tmp_out_abundance_filename = output_prefix + ".has_subsets.abundance.txt"
    tmp_out_gff_filename = output_prefix + ".has_subsets.gff"
    tmp_out_fq = output_prefix + ".has_subsets.rep.fastq"

    out_abundance_filename = output_prefix + ".abundance.txt"
    out_gff_filename = output_prefix + ".gff"

    # Filter collapsed isoforms by min FL count.
    logging.info("Filtering collapsed isoforms by count %s", args.min_count)
    filter_by_count(in_group_filename=in_group_filename,
                    in_abundance_filename=in_abundance_filename,
                    in_gff_filename=in_gff_filename, in_rep_filename=in_fq,
                    out_abundance_filename=tmp_out_abundance_filename,
                    out_gff_filename=tmp_out_gff_filename, out_rep_filename=tmp_out_fq,
                    min_count=args.min_count)

    # Remove collapsed isoforms which are a subset of another isoform
    logging.info("Filtering out subsets collapsed isoforms = %s", args.filter_out_subsets)
    if args.filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=tmp_out_abundance_filename,
                           in_gff_filename=tmp_out_gff_filename,
                           in_rep_filename=tmp_out_fq,
                           out_abundance_filename=out_abundance_filename,
                           out_gff_filename=out_gff_filename,
                           out_rep_filename=out_fq,
                           max_fuzzy_junction=args.max_fuzzy_junction)
        rmpath(tmp_out_abundance_filename)
        rmpath(tmp_out_gff_filename)
        rmpath(tmp_out_fq)
    else:
        mv(tmp_out_abundance_filename, out_abundance_filename)
        mv(tmp_out_gff_filename, out_gff_filename)
        mv(tmp_out_fq, out_fq)

    logging.info("Filtered collapsed isoforms sequences written to %s", realpath(out_fq))
    logging.info("Filtered collapsed isoforms abundance written to %s", realpath(out_abundance_filename))
    logging.info("Filtered collapsed isoforms gff written to %s", realpath(out_gff_filename))
    return 0
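For reference, the nested prefix helper strips either a .rep.fastq/.rep.fq or a plain .fastq/.fq extension so the group, abundance and gff inputs can be inferred from the same prefix; a tiny standalone illustration (the helper is re-stated here only for demonstration, since the real one is nested inside args_runner):

# Standalone illustration of the prefix logic used above (demonstration only).
def _get_prefix_of_rep_fq(fn):
    if fn.endswith(".rep.fastq") or fn.endswith(".rep.fq"):
        return '.'.join(fn.split(".")[0:-2])
    elif fn.endswith(".fastq") or fn.endswith(".fq"):
        return '.'.join(fn.split(".")[0:-1])
    raise ValueError("Invalid collapsed isoforms .rep.fastq file %s" % fn)

assert _get_prefix_of_rep_fq("sample.rep.fastq") == "sample"
assert _get_prefix_of_rep_fq("sample.fastq") == "sample"
# companion inputs then become sample.group.txt, sample.abundance.txt, sample.gff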
Example n. 42
    def run(self):
        """
        First, collapse input isoforms by calling Branch.run().
        Then collapse fuzzy junctions by calling collapse_fuzzy_junctions.
        Finally, pick up a representative gff record for each group of collapsed isoforms.
        """
        self.validate_inputs()

        logging.info("Collapsing isoforms into transcripts.")
        b = Branch(isoform_filename=self.isoform_filename,
                   sam_filename=self.sam_filename,
                   cov_threshold=self.min_flnc_coverage,
                   min_aln_coverage=self.min_aln_coverage,
                   min_aln_identity=self.min_aln_identity)

        b.run(allow_extra_5exon=self.allow_extra_5exon,
              skip_5_exon_alt=self.skip_5_exon_alt,
              ignored_ids_fn=self.ignored_ids_txt_fn,
              good_gff_fn=self.good_unfuzzy_gff_fn,
              bad_gff_fn=self.bad_unfuzzy_gff_fn,
              group_fn=self.unfuzzy_group_fn)

        logging.info("Good unfuzzy isoforms written to: %s",
                     realpath(self.good_unfuzzy_gff_fn))
        logging.info("Bad unfuzzy isoforms written to: %s",
                     realpath(self.bad_unfuzzy_gff_fn))
        logging.info("Unfuzzy isoform groups written to: %s",
                     realpath(self.unfuzzy_group_fn))

        if self.shall_collapse_fuzzy_junctions:
            logging.info("Further collapsing fuzzy junctions.")
            # need to further collapse those that have fuzzy junctions!
            collapse_fuzzy_junctions(
                gff_filename=self.good_unfuzzy_gff_fn,
                group_filename=self.unfuzzy_group_fn,
                fuzzy_gff_filename=self.good_fuzzy_gff_fn,
                fuzzy_group_filename=self.fuzzy_group_fn,
                allow_extra_5exon=self.allow_extra_5exon,
                max_fuzzy_junction=self.max_fuzzy_junction)

            logging.info("Good fuzzy isoforms written to: %s",
                         realpath(self.good_fuzzy_gff_fn))
            logging.info("Bad fuzzy isoforms written to: %s",
                         realpath(self.bad_fuzzy_gff_fn))
            logging.info("Fuzzy isoform groups written to: %s",
                         realpath(self.fuzzy_group_fn))
            ln(self.good_fuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_fuzzy_gff_fn, self.gff_fn)
            ln(self.fuzzy_group_fn, self.group_fn)
        else:
            logging.info("No need to further collapse fuzzy junctions.")
            ln(self.good_unfuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_unfuzzy_gff_fn, self.gff_fn)
            ln(self.unfuzzy_group_fn, self.group_fn)

        # Pick up representative
        logging.info("Picking up representative record.")
        pick_least_err_instead = not self.allow_extra_5exon  # 5merge, pick longest

        pick_rep(isoform_filename=self.isoform_filename,
                 gff_filename=self.good_gff_fn,
                 group_filename=self.group_fn,
                 output_filename=self.rep_fn(self.suffix),
                 pick_least_err_instead=pick_least_err_instead,
                 bad_gff_filename=self.bad_gff_fn)

        logging.info("Ignored IDs written to: %s",
                     realpath(self.ignored_ids_txt_fn))
        logging.info("Output GFF written to: %s", realpath(self.gff_fn))
        logging.info("Output Group TXT written to: %s",
                     realpath(self.group_fn))
        logging.info("Output collapsed isoforms written to: %s",
                     realpath(self.rep_fn(self.suffix)))
        logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
Example n. 43
    def __init__(self, root_dir):
        self.root_dir = realpath(root_dir)
Example n. 44
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12,
                          tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr --nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Example n. 45
    def __init__(self, combined_dir):
        self.combined_dir = realpath(combined_dir)
        mkdir(self.combined_dir)
Example n. 46
def g(d, base_fn):
    """Convert file basename to abs file path"""
    return op.join(realpath(d), base_fn)
Example n. 47
    def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
        """
        if self.use_sge --- writes to <scripts>/daligner_job_#.sh
        else --- run locally, dividing into self.cpus/4 tasks (capped max at 4)

        NOTE 1: when using SGE, be careful that multiple calls to this might
        end up writing to the SAME job.sh files, this should be avoided by
        changing <scripts> directory

        NOTE 2: more commonly this should be invoked locally
        (since ice_partial.py itself is typically qsub-ed);
        in that case it is recommended to keep self.cpus = 4, and the
        original qsub job should have been submitted with qsub -pe smp 4
        (set by --blasr_nproc 4). This way the daligner jobs run
        consecutively, while LA4Ice is parallelized 4X.
        """
        self.output_dir = realpath(output_dir) # Reset output_dir
        old_dir = realpath(op.curdir)
        mkdir(output_dir)
        os.chdir(output_dir)

        if self.use_sge:
            mknewdir(self.script_dir)

        # prepare done scripts is no longer necessary.
        #self.write_daligner_done_script()
        #self.write_la4ice_done_script()

        # (a) run all daligner jobs
        daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                           sensitive_mode=sensitive_mode)

        logging.info("Start daligner cmds " +
                     ("using sge." if self.use_sge else "locally."))
        logging.debug("CMD: " + "\n".join(daligner_cmds))

        start_t = time.time()
        failed = []
        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=daligner_cmds,
                               script_files=self.daligner_scripts,
                               #done_script=self.daligner_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=daligner_cmds,
                                 num_threads=max(1, min(self.cpus/4, 4))))
        logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.")

        # (b) run all LA4Ice jobs
        start_t = time.time()
        logging.info("Start LA4Ice cmds " +
                     ("using sge." if self.use_sge else "locally."))
        la4ice_cmds = self.la4ice_cmds
        logging.debug("CMD: " + "\n".join(la4ice_cmds))

        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=la4ice_cmds,
                               script_files=self.la4ice_scripts,
                               #done_script=self.la4ice_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=la4ice_cmds,
                                 num_threads=max(1, min(self.cpus, 4))))
        logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.")
        os.chdir(old_dir)

        if len(failed) == 0:
            return 0
        else:
            raise RuntimeError("%s.run failed, %s." %
                               (self.__class__.__name__,
                                "\n".join([x[0] for x in failed])))
def post_mapping_to_genome_runner(in_isoforms, in_sam, in_pickle,
                                  out_isoforms, out_gff, out_abundance, out_group, out_read_stat,
                                  min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
                                  min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
                                  min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
                                  max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
                                  allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
                                  skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
                                  min_count=fci.Constants.MIN_COUNT_DEFAULT,
                                  to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file
    (3) Based on abundance file, filter collapsed isoforms by min FL count
    """
    # Check input and output format
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError("Format of input and output isoforms %s, %s must be the same." %
                         (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input and output isoforms %s, %s must be FASTA or FASTQ." %
                         (in_isoforms, out_isoforms))

    #(1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file
    cr = CountRunner(group_filename=cf.group_fn, pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn, in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn, in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    fft = FilteredFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count, filter_out_subsets=True)
    # (4) Remove collapsed isoforms which are a subset of another isoform
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) ln output files
    ln_pairs = [(fff.filtered_rep_fn(out_suffix), out_isoforms), # rep isoforms
                (fff.filtered_gff_fn, out_gff), # gff annotation
                (fff.filtered_abundance_fn, out_abundance), # abundance info
                (fff.group_fn, out_group), # groups
                (fff.read_stat_fn, out_read_stat)] # read stat info
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, filter_out_subsets=%s",
                 min_count, filter_out_subsets)
    logging.info("Collapsed and filtered isoform sequences written to %s",
                 realpath(out_isoforms) if out_isoforms is not None else
                 realpath(fff.filtered_rep_fn(out_suffix)))
    logging.info("Collapsed and filtered isoform annotations written to %s",
                 realpath(out_gff) if out_gff is not None else realpath(fff.filtered_gff_fn))
    logging.info("Collapsed and filtered isoform abundance info written to %s",
                 realpath(out_abundance) if out_abundance is not None else
                 realpath(fff.filtered_abundance_fn))
    logging.info("Collapsed isoform groups written to %s",
                 realpath(out_group) if out_group is not None else realpath(fff.group_fn))
    logging.info("Read status of FL and nFL reads written to %s",
                 realpath(out_read_stat) if out_read_stat is not None else
                 realpath(fff.read_stat_fn))
Example n. 49
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa,
                           root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir,
                      flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs,
                      ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(in_isoforms=in_isoforms,
                                  in_sam=tofu_f.sorted_gmap_sam,
                                  in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
                                  out_isoforms=args.collapsed_filtered_fn,
                                  out_gff=args.gff_fn,
                                  out_abundance=args.abundance_fn,
                                  out_group=args.group_fn,
                                  out_read_stat=args.read_stat_fn,
                                  min_aln_coverage=args.min_aln_coverage,
                                  min_aln_identity=args.min_aln_identity,
                                  min_flnc_coverage=args.min_flnc_coverage,
                                  max_fuzzy_junction=args.max_fuzzy_junction,
                                  allow_extra_5exon=args.allow_extra_5exon,
                                  min_count=args.min_count)

    return 0
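The --mem_debug branch above samples memory with memory_profiler while one bin is clustered; a self-contained sketch of that sampling pattern, with a stand-in callable in place of obj.run (memory_profiler is an optional dependency assumed to be installed):

# Minimal sketch of the memory-sampling pattern used in the --mem_debug branch above.
import time
from memory_profiler import memory_usage   # optional dependency, assumed installed

def _stand_in_for_obj_run():
    time.sleep(2)   # placeholder for the real ICE/Quiver run

start_t = time.time()
mem_usage = memory_usage(_stand_in_for_obj_run, interval=1)   # list of MiB samples
print "took %.1f secs, peak memory %.1f MiB" % (time.time() - start_t, max(mem_usage))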
Example n. 50
    def __init__(self, reads_fn="test.fasta", out_dir="output/",
                 out_reads_fn="testout.fasta", primer_fn=None,
                 primer_report_fn=None, summary_fn=None,
                 cpus=1, change_read_id=True,
                 opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False),
                 out_nfl_fn=None, out_flnc_fn=None,
                 ignore_polyA=False, reuse_dom=False,
                 ignore_empty_output=False):
        self.reads_fn = realpath(reads_fn)
        self.out_dir = realpath(out_dir)
        self.cpus = cpus
        self.change_read_id = change_read_id
        self.chimera_detection_opts = opts
        self.ignore_polyA = ignore_polyA
        self.reuse_dom = reuse_dom
        self.ignore_empty_output = ignore_empty_output
        self._numReads = None

        # The input primer file: primers.fasta
        self.primer_fn = primer_fn if primer_fn is not None else \
            op.join(self.data_dir, PRIMERFN)
        # The output fasta file.
        self.out_all_reads_fn = realpath(out_reads_fn)

        # Intermediate output fasta file before chimera detection.
        #     trimmed full-length reads: fl.trimmed.fasta
        # and
        #     trimmed non-full-length reads: nfl.trimmed.fasta
        self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta")
        self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta")

        self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN)
        self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN)

        # The output primer file: primer_info.csv
        self.primer_report_fn = primer_report_fn \
            if primer_report_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN
        # primer reports for nfl reads before chimera detection. Note that
        # chimera detection is not necessary for nfl reads.
        self._primer_report_nfl_fn = op.join(self.out_dir,
                                             "primer_report.nfl.csv")
        # primer reports for fl reads after chimera detection. Note that
        # chimera detection is required for fl reads.
        self._primer_report_fl_fn = op.join(self.out_dir,
                                            "primer_report.fl.csv")

        # The matrix file: PBMATRIX.txt
        self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN)

        # The output phmmer Dom file for trimming primers: hmmer.front_end.dom
        self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN)
        # The output phmmer Dom file for chimera detection:
        #     hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom
        self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN)
        self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN)

        self.chunked_front_back_reads_fns = None
        self.chunked_front_back_dom_fns = None

        #self.chunked_trimmed_reads_fns = None
        #self.chunked_trimmed_reads_dom_fns = None

        # The summary file: *.classify_summary.txt
        self.summary = ClassifySummary()
        self.summary_fn = summary_fn if summary_fn is not None else \
            ".".join(out_reads_fn.split('.')[:-1]) + \
            "." + CLASSIFYSUMMARY

        self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \
            else op.join(self.out_dir, "nfl.fasta")
        self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta")
        self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta")

        self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \
            else op.join(self.out_dir, "flnc.fasta")
        self.out_flc_fn = op.join(self.out_dir, "flc.fasta")

        for file_attr in ["out_nfl_fn", "out_nflnc_fn", "out_nflc_fn",
                          "out_flnc_fn", "out_flc_fn", "out_all_reads_fn"]:
            file_name = fasta_file_name = getattr(self, file_attr)
            if file_name.endswith(".xml"):
                fasta_file_name = ".".join(file_name.split(".")[:-2])+".fasta"
            setattr(self, "%s_fasta" % file_attr, fasta_file_name)
Example n. 51
def g(d, base_fn):
    """Convert file basename to abs file path"""
    return op.join(realpath(d), base_fn)
Example n. 52
    def run(self):
        """
        First, collapse input isoforms by calling Branch.run().
        Then collapse fuzzy junctions by calling collapse_fuzzy_junctions.
        Finally, pick a representative gff record for each group of collapsed isoforms.
        """
        self.validate_inputs()

        logging.info("Collapsing isoforms into transcripts.")
        b = Branch(isoform_filename=self.isoform_filename,
                   sam_filename=self.sam_filename,
                   cov_threshold=self.min_flnc_coverage,
                   min_aln_coverage=self.min_aln_coverage,
                   min_aln_identity=self.min_aln_identity)

        b.run(allow_extra_5exon=self.allow_extra_5exon,
              skip_5_exon_alt=self.skip_5_exon_alt,
              ignored_ids_fn=self.ignored_ids_txt_fn,
              good_gff_fn=self.good_unfuzzy_gff_fn,
              bad_gff_fn=self.bad_unfuzzy_gff_fn,
              group_fn=self.unfuzzy_group_fn)

        logging.info("Good unfuzzy isoforms written to: %s", realpath(self.good_unfuzzy_gff_fn))
        logging.info("Bad unfuzzy isoforms written to: %s", realpath(self.bad_unfuzzy_gff_fn))
        logging.info("Unfuzzy isoform groups written to: %s", realpath(self.unfuzzy_group_fn))

        if self.shall_collapse_fuzzy_junctions:
            logging.info("Further collapsing fuzzy junctions.")
            # need to further collapse those that have fuzzy junctions!
            collapse_fuzzy_junctions(gff_filename=self.good_unfuzzy_gff_fn,
                                     group_filename=self.unfuzzy_group_fn,
                                     fuzzy_gff_filename=self.good_fuzzy_gff_fn,
                                     fuzzy_group_filename=self.fuzzy_group_fn,
                                     allow_extra_5exon=self.allow_extra_5exon,
                                     max_fuzzy_junction=self.max_fuzzy_junction)

            logging.info("Good fuzzy isoforms written to: %s", realpath(self.good_fuzzy_gff_fn))
            logging.info("Bad fuzzy isoforms written to: %s", realpath(self.bad_fuzzy_gff_fn))
            logging.info("Fuzzy isoform groups written to: %s", realpath(self.fuzzy_group_fn))
            ln(self.good_fuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_fuzzy_gff_fn, self.gff_fn)
            ln(self.fuzzy_group_fn, self.group_fn)
        else:
            logging.info("No need to further collapse fuzzy junctions.")
            ln(self.good_unfuzzy_gff_fn, self.good_gff_fn)
            ln(self.good_unfuzzy_gff_fn, self.gff_fn)
            ln(self.unfuzzy_group_fn, self.group_fn)

        # Pick up representative
        logging.info("Picking up representative record.")
        pick_least_err_instead = not self.allow_extra_5exon  # when extra 5' exons are merged (allow_extra_5exon), pick the longest isoform; otherwise pick the one with the least error

        pick_rep(isoform_filename=self.isoform_filename,
                 gff_filename=self.good_gff_fn,
                 group_filename=self.group_fn,
                 output_filename=self.rep_fn(self.suffix),
                 pick_least_err_instead=pick_least_err_instead,
                 bad_gff_filename=self.bad_gff_fn)

        logging.info("Ignored IDs written to: %s", realpath(self.ignored_ids_txt_fn))
        logging.info("Output GFF written to: %s", realpath(self.gff_fn))
        logging.info("Output Group TXT written to: %s", realpath(self.group_fn))
        logging.info("Output collapsed isoforms written to: %s", realpath(self.rep_fn(self.suffix)))
        logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
Esempio n. 53
0
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- passed to blasr --nproc, the number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    # All query read ids from the input reads (despite the name, not only those with hits).
    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
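A minimal sketch (hypothetical paths, not part of the original module) of consuming the pickle written above; the layout matches the docstring: 'partial_uc' maps each isoform id to a list of read ids, and 'nohit' is the set of unassigned reads.

from pickle import load

with open("nfl.partial_uc.pickle") as f:  # hypothetical output of build_uc_from_partial
    result = load(f)

partial_uc = result['partial_uc']  # {isoform_id: [read_id, ...]}
nohit = result['nohit']            # set of read ids with no BLASR hit

n_assigned = sum(len(reads) for reads in partial_uc.values())
print "Assigned %d reads to %d isoforms; %d reads had no hit." % \
      (n_assigned, len(partial_uc), len(nohit))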