Beispiel #1
0
    def __init__(self,
                 folder,
                 organism,
                 alpha=0.05,
                 log2_fc=0,
                 progress=True,
                 mapper=None,
                 background=None):
        """Load RNADiff results and run a KEGG pathway enrichment on them.

        :param folder: first argument given to ``RNADiffResults``.
        :param organism: value assigned to the ``organism`` attribute of the
            KEGG service.
        :param alpha: forwarded to ``RNADiffResults``.
        :param log2_fc: forwarded to ``RNADiffResults``.
        :param progress: forwarded to :meth:`_load_pathways`.
        :param mapper: stored as ``self.mapper``.
        :param background: number of genes used as background. When falsy
            (``None`` or 0), the number of lines returned by the KEGG
            ``list`` request for the organism is used instead.

        Errors raised by :meth:`compute_enrichment` are logged and not
        propagated, so the instance is always created.
        """
        print("DRAFT in progress")
        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # some clean up: identifiers may carry a "gene:" prefix; remove it
        # from the ID column, the index and every gene list
        def _strip_prefix(names):
            return [x.replace("gene:", "") for x in names]

        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = _strip_prefix(self.rnadiff.df['ID'])
        self.rnadiff.df.index = _strip_prefix(self.rnadiff.df.index)
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = _strip_prefix(values)

        if background:
            self.background = background
        else:
            # one line per entry in the KEGG listing of the organism
            self.background = len(
                self.kegg.list(self.kegg.organism).split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        try:
            self.compute_enrichment()
        except Exception as err:
            # best effort: keep the object usable, but report what failed
            logger.critical("An error occured while computing enrichments")
            logger.critical(err)
Beispiel #2
0
    def check_input_files(self, stop_on_error=True):
        # Sanity checks
        cfg = self.config.config
        filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern)
        logger.info("Found {} files matching your input  pattern ({})".format(
            len(filenames), cfg.input_pattern))

        if len(filenames) == 0:
            logger.critical(
                "Found no files with your matching pattern ({})".format(
                    cfg.input_pattern))
            if "*" not in cfg.input_pattern and "?" not in cfg.input_pattern:
                logger.critical(
                    "No wildcard used in your input pattern, please use a * or ? character"
                )
            if stop_on_error:
                sys.exit(1)

        from sequana import FastQFactory
        try:
            ff = FastQFactory(cfg.input_directory + os.sep + cfg.input_pattern,
                              read_tag=cfg.input_readtag)

            # This tells whether the data is paired or not
            if ff.paired:
                paired = "paired reads"
            else:
                paired = "single-end reads"
            logger.info(
                "Your input data seems to be made of {}".format(paired))

        except:
            logger.error(
                """Input data is not fastq-compatible with sequana pipelines. You may want to set the read_tag to empty string or None if you wish
to analyse non-fastQ files (e.g. BAM)""")
            sys.exit(1)
Beispiel #3
0
def main(args=None):
    """Mostly checking the options provided by the user and then call
    :func:`sequana_init` function to create the pre-filled config file +
    snakemake + README +runme.sh in a dedicated project directory.

    :param args: command line as a list whose first item is the program
        name; defaults to a copy of ``sys.argv``.

    The function returns early for the informative options (--issue,
    --version, --show-pipelines, --info, --get-config) and exits with
    status 1 on invalid combinations of options.
    """
    import sequana
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        sa = Tools()
        sa.purple("Welcome to Sequana standalone application")
        logger.critical("You must use --pipeline <valid pipeline name>\nuse "
                        "--show-pipelines or --help for more information. ")
        return
    else:
        options = user_options.parse_args(args[1:])

    # these imports must be local. This also speed up the --help

    sa = Tools(verbose=options.verbose)
    sa.purple("Welcome to Sequana standalone application")

    # Those options are mutually exclusive. They are packed into a bit field
    # (most significant bit first): --issue 32, --version 16, --info 8,
    # --show-pipelines 4, --pipeline 2, --get-config 1
    flag = int(
        "%s%s%s%s%s%s" %
        (int(bool(options.issue)), int(bool(options.version)),
         int(bool(options.info)), int(bool(options.show_pipelines)),
         int(bool(options.pipeline)), int(bool(options.get_config))), 2)
    if flag not in [1, 2, 4, 8, 16, 3, 32]:
        logger.critical("You must use one of --pipeline, --info, "
                        "--show-pipelines, --issue, --version, --get-config")
        sys.exit(1)

    # OPTIONS that gives info and exit
    if options.issue:
        onweb('https://github.com/sequana/sequana/issues')
        return

    if options.version:
        sa.purple("Sequana version %s" % sequana.version)
        return

    if options.show_pipelines:
        sa.purple("Valid pipeline names:")
        for this in sorted(valid_pipelines):
            m = Module(this)
            sa.green(" - " + this)
            print(textwrap(m.overview, indent=8))
        return

    if options.info:
        module = Module(options.info)
        module.onweb()
        return

    if options.pipeline:
        # check validity of the pipeline name
        if options.pipeline not in valid_pipelines:
            txt = "".join([" - %s\n" % this for this in valid_pipelines])
            logger.critical("%s not a valid pipeline name. Use of one:\n" %
                            options.pipeline + txt)
            sys.exit(1)

    # copy locally the request config file from a specific pipeline
    if flag == 3:  #--get-config and --pipeline used
        module = Module(options.pipeline)
        copy_config_from_sequana(module)
        return

    # pipeline should be defined by now. Let us start the real work here
    Module("dag").check("warning")
    Module(options.pipeline).check("warning")

    # If user provides file1 and/or file2, check the files exist
    if options.file1 and os.path.exists(options.file1) is False:
        raise ValueError("%s does not exist" % options.file1)

    if options.file2 and os.path.exists(options.file2) is False:
        raise ValueError("%s does not exist" % options.file2)

    if options.kraken and os.path.exists(options.kraken) is False:
        raise ValueError("%s does not exist" % options.kraken)

    if options.input_directory and os.path.exists(
            options.input_directory) is False:
        raise ValueError("%s does not exist" % options.input_directory)

    # check valid combo of arguments
    flag = int(
        "%s%s%s%s%s" % (
            int(bool(options.pattern)),
            int(bool(options.input_directory)),
            int(bool(options.file1)),
            int(bool(options.file2)),
            int(bool(options.config)),
        ), 2)

    # bit values, most significant bit first:
    # --pattern alone: 16
    # --input-directory alone: 8
    # --file1 alone: 4
    # --file2 alone: 2
    # --file1 + --file2 : 4+2=6
    # config file alone : 1
    # none of those options redirect to input_directory=local
    if flag not in [0, 1, 2, 4, 6, 8, 16]:
        logger.critical(help_input + "\n\nUse --help for more information")
        sys.exit(1)

    assert options.extension in ["fastq", "fq", "fastq.gz", "fq.gz", "bam"]

    # Note that we use abspath to make it more robust and easier to debug
    # If no options, we use input_directory and set it to "."
    if flag == 0 or options.input_directory:
        if flag == 0:
            options.input_directory = "."
        options.input_directory = os.path.abspath(options.input_directory)
        data = options.input_directory + os.sep + "*" + options.extension
        options.file1 = ""
        options.file2 = ""
        options.pattern = ""
        if options.verbose:
            logger.info("Looking for sample files matching %s" % data)
    elif options.pattern:
        options.pattern = os.path.abspath(options.pattern)
        data = os.path.abspath(options.pattern)
        options.input_directory = ""
        options.extension = ""
        options.file1 = ""
        options.file2 = ""
    elif options.config:
        # the inputs are described by the config file: nothing to scan here
        data = None
    elif options.file1:
        data = [options.file1]
        options.file1 = os.path.abspath(options.file1)
        if options.file2:
            # keep file1 in the list: both mates must be checked
            data.append(options.file2)
            options.file2 = os.path.abspath(options.file2)
        options.input_directory = ""
        options.pattern = ""
        options.extension = ""

    # The factories are not used afterwards; presumably they are built only
    # to validate the input files -- TODO confirm
    if data is not None:
        if options.extension == 'bam' or options.pattern.endswith('bam') or \
                options.pattern.endswith('bed'):
            FileFactory(data)
        else:
            FastQFactory(data,
                         read_tag=options.input_readtag,
                         verbose=options.verbose)

    if options.pipeline == 'quality_control' or options.pipeline == 'rnaseq':
        # check combo. Bit values: --no-adapters 16, --design 8,
        # --adapters 4, --adapter_fwd 2, --adapter_rev 1
        flag = int(
            "%s%s%s%s%s" %
            (int(bool(options.no_adapters)), int(bool(options.design)),
             int(bool(options.adapters)), int(bool(
                 options.adapter_fwd)), int(bool(options.adapter_rev))), 2)

        if flag not in [16, 12, 6, 4, 2, 3]:
            logger.critical(
                "You must use a design experimental file using --design"
                " and --adapters to indicate the type of adapters (PCRFree"
                " or Nextera), or provide the adapters directly as a "
                " string (or a file) using --adapter_fwd (AND --adapter_"
                "rev for paired-end data). A third way is to set --adapters"
                " to either Nextera, PCRFree, Rubicon or universal in which case "
                " all adapters will be used (slower). Finally, you may use "
                " --no-adapters for testing purpose or if you know there "
                " is no adapters")
            sys.exit(1)

        # flag 12 (design + adapters when wrong args provided)
        if options.design and options.adapters not in adapters_choice:
            raise ValueError(
                "When using --design, you must also "
                "provide the type of adapters using --adapters (set to "
                "one of %s )" % adapters_choice)
        if options.design and options.adapters:
            from sequana import FindAdaptersFromDesign
            fa = FindAdaptersFromDesign(options.design, options.adapters)
            fa.check()

        # flag 12 (design + adapters with correct args)
        # NOTE(review): this branch looks unreachable (the previous test
        # already matches whenever --design comes with a valid --adapters)
        # and it sets adapters_fwd/adapters_rev whereas the rest of the
        # function uses adapter_fwd/adapter_rev -- confirm the intent
        elif options.design and options.adapters in adapters_choice:
            options.adapters_fwd = options.adapters
            options.adapters_rev = options.adapters
        elif options.no_adapters:
            options.adapter_fwd = "XXXX"
            options.adapter_rev = "XXXX"
        else:
            if options.adapter_fwd is None:
                if options.adapters not in ["universal"] + adapters_choice:
                    msg = "Incorrect adapter choice %s. " % options.adapters
                    msg += "Correct values are :\n"
                    for this in ['universal'] + adapters_choice:
                        msg += " - {}\n ".format(this)
                    logger.error(msg)
                    raise ValueError
                # flag 4
                if options.adapters == "universal":
                    options.adapter_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
                    options.adapter_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
                # flag 4
                else:
                    # Let the pipeline handle the names
                    options.adapter_fwd = options.adapters
                    options.adapter_rev = options.adapters
            # flag 2/3
            else:
                if options.adapter_fwd:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_fwd):
                        AdapterReader(options.adapter_fwd)
                        options.adapter_fwd = "file:%s" % options.adapter_fwd
                if options.adapter_rev:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_rev):
                        AdapterReader(options.adapter_rev)
                        options.adapter_rev = "file:%s" % options.adapter_rev
        if options.design:
            # Just check the format
            adapter_finder = FindAdaptersFromDesign(options.design,
                                                    options.adapters)

    # If all options are valid, we can now create the tree structure
    sequana_init(options)
Beispiel #4
0
def main(args=None):
    """Run a Kraken analysis from the command line and build the HTML report.

    :param args: command line as a list whose first item is the program
        name; defaults to a copy of ``sys.argv``.

    Depending on the options, the function may instead update the local
    taxonomy file (--update-taxonomy) or download a database (--download)
    and then exit. Otherwise the FastQ files are analysed against one or
    several databases and results are written in ``<directory>/kraken``.

    :raises ValueError: if a database is found neither in the sequana
        configuration path nor locally.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        # presumably --help makes the parser exit, so that "options" is
        # never needed on this path -- TODO confirm
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.level

    if options.update_taxonomy is True:
        from sequana.taxonomy import Taxonomy
        tax = Taxonomy()
        from sequana import sequana_config_path as cfg
        logger.info(
            "Will overwrite the local database taxonomy.dat in {}".format(cfg))
        tax.download_taxonomic_file(overwrite=True)
        sys.exit(0)

    # We put the import here to make the --help faster
    from sequana import KrakenPipeline
    from sequana.kraken import KrakenSequential
    devtools = DevTools()

    if options.download:
        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download(options.download)
        sys.exit()

    fastq = []
    if options.file1:
        devtools.check_exists(options.file1)
        fastq.append(options.file1)
    if options.file2:
        devtools.check_exists(options.file2)
        fastq.append(options.file2)

    from sequana import sequana_config_path as scfg
    if options.databases is None:
        logger.critical("You must provide a database")
        sys.exit(1)

    databases = []
    for database in options.databases:
        # short aliases of databases
        if database == "toydb":
            database = "kraken_toydb"
        elif database == "minikraken":
            database = "minikraken_20141208"

        if os.path.exists(scfg + os.sep + database):  # in Sequana path
            databases.append(scfg + os.sep + database)
        elif os.path.exists(database):  # local database
            databases.append(database)
        else:
            msg = "Invalid database name (%s). Neither found locally "
            msg += "or in the sequana path %s; Use the --download option"
            raise ValueError(msg % (database, scfg))

    output_directory = options.directory + os.sep + "kraken"
    devtools.mkdirs(output_directory)

    def _pathto(name):
        # prefix a non-empty filename with the kraken output directory;
        # empty/None values are returned unchanged
        return "{}/kraken/{}".format(options.directory, name) if name else name

    # if there is only one database, use the pipeline else KrakenSequential
    if len(databases) == 1:
        logger.info("Using 1 database")
        k = KrakenPipeline(fastq,
                           databases[0],
                           threads=options.thread,
                           output_directory=output_directory,
                           confidence=options.confidence)

        k.run(output_filename_classified=_pathto(options.classified_out),
              output_filename_unclassified=_pathto(options.unclassified_out))
    else:
        logger.info("Using %s databases" % len(databases))
        k = KrakenSequential(fastq,
                             databases,
                             threads=options.thread,
                             output_directory=output_directory + os.sep,
                             force=True,
                             keep_temp_files=options.keep_temp_files,
                             output_filename_unclassified=_pathto(
                                 options.unclassified_out),
                             confidence=options.confidence)
        k.run(output_prefix="kraken")

    # This statements sets the directory where HTML will be saved
    from sequana.utils import config
    config.output_dir = options.directory

    # output_directory first argument: the directory where to find the data
    # output_filename is relative to the config.output_dir defined above
    KrakenModule(output_directory, output_filename="summary.html")

    logger.info("Open ./%s/summary.html" % options.directory)
    logger.info("or ./%s/kraken/kraken.html" % options.directory)

    if options.html is True:
        # open the summary page announced above in the default browser
        import webbrowser
        from pathlib import Path
        report = Path(options.directory) / "summary.html"
        webbrowser.open(report.resolve().as_uri())
Beispiel #5
0
    def check_options(self, options):
        """Validate the cutadapt-related options and normalise them in place.

        :param options: object with the attributes ``cutadapt_design_file``,
            ``cutadapt_adapter_choice``, ``cutadapt_fwd`` and
            ``cutadapt_rev``.

        Behaviour, by priority:

        - a design file is given: forward/reverse adapters must not be set
          and an adapter choice is required; the design is then checked
          against that choice. Any failure exits with status 1.
        - adapter choice is ``"none"``: the choice is reset to None and both
          adapters are set to ``"XXXX"``.
        - adapter choice is ``"universal"``: both adapters are set to
          hard-coded sequences.
        - no adapter choice: a forward/reverse value that is an existing
          file is read with ``AdapterReader`` and replaced by
          ``file:<absolute path>``; other values are left untouched.
        - any other adapter choice: nothing to do.
        """
        design = options.cutadapt_design_file
        choice = options.cutadapt_adapter_choice

        if design:
            if options.cutadapt_fwd or options.cutadapt_rev:
                logger.critical(
                    "When using --cutadapt-design-file, one must not"
                    " set the forward/reverse adapters with --cutadapt-fwd"
                    " and/or --cutadapt-rev\n\n" + self.description)
                sys.exit(1)

            # otherwise, we just check the format but we need the adapter choice
            if choice in [None, 'none']:
                logger.critical(
                    "When using --cutadapt-design-file, you must also"
                    " provide the type of adapters using --cutadapt-adapter-choice"
                    " (set to one of %s )" % self.adapters_choice)
                sys.exit(1)
            from sequana import FindAdaptersFromDesign
            fa = FindAdaptersFromDesign(design, choice)
            try:
                fa.check()
            # Exception (not a bare except) so that Ctrl-C and SystemExit
            # are not reported as a design-file problem
            except Exception:
                logger.critical("Your design file contains indexes not found "
                                "in the list of adapters from {}".format(
                                    choice))
                sys.exit(1)

        # No design provided here below
        # do we need to remove adapters at all ?
        elif choice == "none":
            options.cutadapt_adapter_choice = None
            options.cutadapt_fwd = "XXXX"
            options.cutadapt_rev = "XXXX"
        # or just the universal ones ?
        elif choice == "universal":
            options.cutadapt_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
            options.cutadapt_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
        # or do we have a string or files provided for the fwd/rev ?
        elif choice is None:
            if options.cutadapt_fwd:
                # Could be a string or a file. If a file, check the format
                if os.path.exists(options.cutadapt_fwd):
                    AdapterReader(options.cutadapt_fwd)
                    options.cutadapt_fwd = "file:{}".format(
                        os.path.abspath(options.cutadapt_fwd))
            if options.cutadapt_rev:
                # Could be a string or a file. If a file, check the format
                if os.path.exists(options.cutadapt_rev):
                    AdapterReader(options.cutadapt_rev)
                    options.cutadapt_rev = "file:{}".format(
                        os.path.abspath(options.cutadapt_rev))
        # any other adapter choice: nothing to do, the cutadapt rules from
        # sequana will use the adapter_choice, and fill the fwd/rev
        # automatically
Beispiel #6
0
def main(args=None):
    """Set up a bcl2fastq project from the command line.

    The options are parsed, the sample sheet and the BCL directory are
    checked, the run-parameters file (if any) is scanned to detect NextSeq
    runs, then the configuration is filled and saved. With ``--run`` the
    generated shell script is started in the working directory.

    :param args: command line as a list whose first item is the program
        name; defaults to ``sys.argv``.
    """
    argv = sys.argv if args is None else args

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    parser = Options(NAME, epilog=sequana_epilog)
    options = parser.parse_args(argv[1:])

    # the manager creates the beginning of the command and the working
    # directory
    manager = SequanaManager(options, NAME)
    manager.setup()

    from sequana import logger
    logger.setLevel(options.level)

    # ---------------------------------------------- sanity checks
    # both paths must exist (same message for the two of them)
    for required in (options.samplesheet, options.bcl_directory):
        if not os.path.exists(required):
            logger.error(f"{required} file does not exists")
            sys.exit(1)

    # An invalid sample sheet is reported here but does not stop the setup
    from sequana import iem
    try:
        iem.IEM(options.samplesheet).validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            """Your sample sheet seems to be incorrect. Before running the pipeline
you will have to fix it. You may use 'sequana samplesheet --quick-fix'""")

    # The run-parameters file is RunParameters.xml (NextSeq) or
    # runParameters.xml (HiSeq); the first one found wins
    runparam = None
    for basename in ("RunParameters.xml", "runParameters.xml"):
        candidate = options.bcl_directory + os.sep + basename
        if os.path.exists(candidate):
            runparam = candidate
            break
    else:
        logger.warning("RunParameters.xml or runParameters.xml file not found")

    if runparam:
        with open(runparam, "r") as fin:
            content = fin.read()

        strategy = options.merging_strategy
        if "NextSeq" in content and strategy != "merge":
            if strategy == "none_and_force":
                logger.warning(
                    "This is a NextSeq. You set the --merging-strategy to"
                    " none_and_force. So, we proceed with no merging strategy")
            if strategy == "none":
                logger.warning("This is a NextSeq run. You must set the "
                               " --merging-strategy to 'merge'.")
                sys.exit(1)

    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        # validate the sample sheet through its path stored in the config
        from sequana.iem import IEM
        IEM(cfg.bcl2fastq.samplesheet_file).validate()

        # the output directory is defined by the working directory
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        merging = options.merging_strategy
        if merging == "merge":
            cfg.bcl2fastq.merge_all_lanes = True
        elif merging in ("none", "none_and_force"):
            cfg.bcl2fastq.merge_all_lanes = False

        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            # merging is switched off with --mars-seq
            if merging == "merge":
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown(check_input_files=False)

    if options.run:
        script = f"{NAME}.sh"
        subprocess.Popen(["sh", script], cwd=options.workdir)
Beispiel #7
0
def main(args=None):
    """Set up an RNA-seq project from the command line.

    The options are parsed, the configuration is filled from them (unless
    ``--from-project`` is used), the genome directory and the GFF file are
    checked, then the project is saved. With ``--run`` the generated shell
    script is started in the working directory.

    :param args: command line as a list whose first item is the program
        name; defaults to ``sys.argv``.

    Every failed sanity check logs a message and exits with status 1.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # stays None with --from-project: the config is then not edited here
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # genome name = cfg.genome.genome_directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if os.path.exists(fasta) is False:
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # Do we need the indexing ? For each aligner: the file (relative to
        # the genome directory) whose presence tells the index exists, and
        # the name used in the log messages
        index_markers = {
            "bowtie2": (f"/bowtie2/{genome_name}.rev.1.bt2", "bowtie2"),
            "star": ("/star/SAindex", "STAR"),
            "bowtie1": (f"/bowtie1/{genome_name}.rev.1.ebwt", "bowtie1"),
            "salmon": ("/salmon/salmon.done", "salmon"),
        }
        if options.aligner in index_markers:
            marker, label = index_markers[options.aligner]
            if os.path.exists(prefix + marker):
                logger.info("Indexing found for {}.".format(label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(
                        label))
                cfg.general.indexing = True

        #options.do_indexing
        cfg.general.force_indexing = options.force_indexing
        cfg.general.rRNA_feature = options.rRNA_feature
        cfg.general.contaminant_file = options.contaminant_file

        if options.rRNA_feature and options.contaminant_file:
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            sys.exit(1)

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carfully the GFF; if users knows what he is doing; no need to
        # check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(
                cfg["general"]["genome_directory"])
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.get_df()
            valid_types = gff.get_types()

            # first check the rRNA feature
            if cfg['general']["rRNA_feature"] and \
                cfg['general']["rRNA_feature"] not in valid_types:

                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(valid_types))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "checking your input GFF file and feature counts options")
            # if only one feature (99% of the projet)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                # NOTE(review): the enclosing test already requires that the
                # feature holds no comma, so this branch looks unreachable
                # -- confirm the intent
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(',')
                gff.save_gff_filtered(features=fc_types, filename='custom.gff')
                cfg.general.custom_gff = 'custom.gff'

            for fc_type in fc_types:
                S = sum(df_gff['type'] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_types))
                    sys.exit(1)
                else:
                    logger.info("Found {} {} entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, set(attributes)))
                    sys.exit(1)
                else:
                    unique = set([
                        x[fc_attr] for k, x in dd.attributes.items()
                        if fc_attr in x
                    ])
                    logger.info(
                        "Found {} {} entries for attribute '{}' [{} unique entries]"
                        .format(S, fc_attr, fc_type, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in set(attributes):
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, set(attributes)))
                            sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory
    custom_gff = None
    if cfg is not None:
        try:  # option added in latest version: may be missing from the config
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort: the project is already saved at this stage
            logger.warning("Could not copy {} into {}: {}".format(
                custom_gff, options.workdir, err))

    if options.run:
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
Beispiel #8
0
def main(args=None):
    """Command-line entry point that sets up the sequana_rnaseq pipeline.

    The function parses the command line, creates the working directory
    through ``SequanaManager``, fills the pipeline configuration from the
    parsed options (unless ``--from-project`` is used), optionally validates
    the genome GFF file against the requested feature-counts settings,
    finalises the setup with ``manager.teardown()`` and, if requested,
    launches the generated ``<NAME>.sh`` script.

    :param args: full argument vector including the program name in first
        position (same layout as ``sys.argv``). Defaults to ``sys.argv``.

    The process is terminated through ``sys.exit(1)`` (``SystemExit``) when
    the genome FASTA file is missing or when the GFF sanity checks fail, so
    that callers/shells can detect the failure from the exit status.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger

    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # Path of a custom GFF to copy into the working directory after teardown.
    # It stays None when the configuration is taken from an existing project
    # (the config is not touched in that case) or when no custom GFF is set.
    custom_gff = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        genome_directory = os.path.abspath(options.genome_directory)
        cfg.general.genome_directory = genome_directory
        cfg.general.aligner = options.aligner

        # The genome files are expected to be named after the genome
        # directory itself: <genome_directory>/<genome_name>.{fa,gff}
        genome_name = genome_directory.rsplit("/", 1)[1]
        fasta = genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(
                options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # -1 means "not set by the user": a per-tool default is used below
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc

        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file")
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file
        # -------------------------------------------------------- RNAdiff

        # NOTE(review): imported but not used in this function; kept in case
        # the import is relied upon for its side effects.
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carefully the GFF; if the user knows what he is doing, no need
        # to check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "Checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3

            gff_file = genome_directory + "/" + genome_name + ".gff"

            gff = GFF3(gff_file)
            df_gff = gff.df  # This takes one minute on eukaryotes.
            valid_features = gff.features  # about 3 seconds
            # NOTE(review): not used below; kept because accessing the
            # property may validate/cache the GFF attributes (about 10 seconds)
            valid_attributes = gff.attributes

            # first check the rRNA feature
            if (cfg["general"]["rRNA_feature"]
                    and cfg["general"]["rRNA_feature"] not in valid_features):

                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(
                        valid_features))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "Checking your input GFF file and feature counts options.")
            logger.info(
                f"You chose '{fc_type}' feature and '{fc_attr}' attribute")
            # if only one feature (99% of the projects)
            # NOTE(review): the enclosing condition already requires that the
            # feature contains no comma, so the 'else' branch below cannot be
            # reached from here; confirm whether multi-feature projects are
            # meant to build the custom GFF elsewhere.
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(",")
                gff.save_gff_filtered(features=fc_types, filename="custom.gff")
                cfg.general.custom_gff = "custom.gff"

            for fc_type in fc_types:
                # rows of the GFF whose feature type is the requested one
                is_feature = df_gff["genetic_type"] == fc_type
                n_features = int(is_feature.sum())
                if n_features == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_features))
                    sys.exit(1)
                logger.info("Found {} '{}' entries".format(
                    n_features, fc_type))

                # now we check the attribute: each row stores its attributes
                # as a mapping (attribute name -> value)
                dd = df_gff[is_feature]
                attributes = [y for x in dd.attributes for y in x.keys()]
                attribute_names = set(attributes)
                n_attributes = attributes.count(fc_attr)
                if n_attributes == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, attribute_names))
                    sys.exit(1)

                unique = {x[fc_attr] for x in dd.attributes if fc_attr in x}
                logger.info(
                    "Found {} '{}' entries for the attribute [{} unique entries]"
                    .format(n_attributes, fc_attr, len(unique)))

                if n_attributes != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in attribute_names:
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, attribute_names))
                            sys.exit(1)

        # 'custom_gff' was added in recent versions of the configuration
        # file; older ones may not define it at all.
        try:
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory. This is a
    # best-effort step: a failure is reported but does not abort the setup.
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            logger.warning(
                "Could not copy the custom GFF file {} into {}: {}".format(
                    custom_gff, options.workdir, err))

    if options.run:
        # The script is started in the working directory and not waited for.
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)