Example #1
0
    def __init__(self,
                 pattern="**/summary.json",
                 output_filename=None,
                 verbose=True,
                 **kargs):
        """Gather the summary files matching *pattern* and build the report.

        :param pattern: glob pattern used to locate the summary files. The
            search is recursive, so ``**`` is honoured.
        :param output_filename: passed unchanged to ``self.create_html``.
        :param verbose: when it is exactly ``False`` the sequana logger level
            is set to WARNING, otherwise to INFO.
        :param kargs: accepted for compatibility; not used by this
            constructor.
        """
        super().__init__()

        from sequana import logger
        logger.setLevel("WARNING" if verbose is False else "INFO")

        logger.info(
            "Sequana Summary is still a tool in progress and have been " +
            "  tested with the quality_control pipeline only for now.")
        self.title = "Sequana multiple summary"
        self.devtools = DevTools()

        self.filenames = glob.glob(pattern, recursive=True)

        # Parse every file only once; the project names are read from the
        # already-loaded summaries instead of re-reading each file from disk.
        self.summaries = [ReadSummary(filename) for filename in self.filenames]
        self.projects = [summary.data['project'] for summary in self.summaries]

        self.create_report_content()
        self.create_html(output_filename)
Example #2
0
def main(args=None):
    """Command-line entry point of ``sequana_substractor``.

    :param args: argument vector including the program name at index 0.
        When ``None``, a copy of ``sys.argv`` is used. The list given by the
        caller is never modified.
    :raises SystemExit: with status 0 after printing the version when
        ``--version`` is part of the arguments.
    """
    # Always work on a private copy: "--help" may be appended below and the
    # caller's list (or sys.argv itself) must not be altered.
    args = sys.argv[:] if args is None else list(args)

    print(purple("Welcome to sequana_substractor"))
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))
    user_options = Options(prog="sequana_substractor")
    if len(args) == 1:
        args.append("--help")

    # Inspect the effective arguments rather than sys.argv so that an
    # explicit ``args`` list is honoured.
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])
    logger.setLevel(options.level)

    # build the references list; a non-empty --references replaces the
    # single --reference value (behaviour kept from the original code)
    if options.references:
        patterns = options.references
    elif options.reference:
        patterns = [options.reference]
    else:
        patterns = []
    options.references = patterns

    # expand globs if any
    references = []
    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches:
            # a typo in a path would otherwise vanish without any trace
            logger.warning(
                "No file found for reference {}".format(pattern))
        references.extend(matches)

    logger.info("{} references provided: {}".format(len(references),
                                                    ",".join(references)))

    # call the entire machinery here
    sub = Substractor(options.input, references, options.outdir,
                      options.mapper, options.threads)
    sub.run(options.outfile)
Example #3
0
def main(args=None):
    """Prepare (and optionally launch) the pipeline identified by ``NAME``.

    The function parses the command line, lets ``SequanaManager`` create the
    working directory, checks the sample sheet and the BCL directory, looks
    for a run-parameters file to detect NextSeq runs, fills the ``bcl2fastq``
    section of the configuration and finally calls ``manager.teardown``.

    :param args: argument vector including the program name at index 0;
        ``sys.argv`` is used when it is ``None``.
    :raises SystemExit: with status 1 when the sample sheet or the BCL
        directory does not exist, or when a NextSeq run is combined with the
        ``none`` merging strategy.
    """
    argv = sys.argv if args is None else args

    # hook shared by all pipelines, to be called before parsing the options
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # the parser includes the common epilog
    parser = Options(NAME, epilog=sequana_epilog)
    options = parser.parse_args(argv[1:])

    # the manager creates the start of the command and the working directory
    manager = SequanaManager(options, NAME)
    manager.setup()

    from sequana import logger
    logger.setLevel(options.level)

    # ------------------------------------------------------- sanity checks
    # both inputs must exist; stop at the first missing one
    for option_name in ("samplesheet", "bcl_directory"):
        location = getattr(options, option_name)
        if not os.path.exists(location):
            logger.error(f"{location} file does not exists")
            sys.exit(1)

    # An invalid sample sheet is reported here but does not stop the setup
    from sequana import iem
    try:
        iem.IEM(options.samplesheet).validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            """Your sample sheet seems to be incorrect. Before running the pipeline
you will have to fix it. You may use 'sequana samplesheet --quick-fix'""")

    # Two spellings of the run-parameters file are looked up, in this order:
    # RunParameters.xml (NextSeq) then runParameters.xml (HiSeq)
    candidates = [
        options.bcl_directory + os.sep + basename
        for basename in ("RunParameters.xml", "runParameters.xml")
    ]
    runparam = next(
        (candidate for candidate in candidates if os.path.exists(candidate)),
        None)

    if runparam is None:
        logger.warning("RunParameters.xml or runParameters.xml file not found")
    else:
        with open(runparam, "r") as fin:
            content = fin.read()

        strategy = options.merging_strategy
        if "NextSeq" in content and strategy != "merge":
            if strategy == "none_and_force":
                logger.warning(
                    "This is a NextSeq. You set the --merging-strategy to"
                    " none_and_force. So, we proceed with no merging strategy")
            if strategy == "none":
                logger.warning(
                    "This is a NextSeq run. You must set the "
                    " --merging-strategy to 'merge'.")
                sys.exit(1)

    # --------------------------------------------- fill the configuration
    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        # second validation, this time without catching any error
        from sequana.iem import IEM
        IEM(cfg.bcl2fastq.samplesheet_file).validate()

        # the output directory is defined by the working directory
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        # any other strategy value leaves merge_all_lanes untouched
        lane_merging = {"merge": True, "none": False, "none_and_force": False}
        if options.merging_strategy in lane_merging:
            cfg.bcl2fastq.merge_all_lanes = lane_merging[
                options.merging_strategy]

        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            if options.merging_strategy == "merge":
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake file, update the
    # config file and save it
    manager.teardown(check_input_files=False)

    if options.run:
        subprocess.Popen(["sh", f"{NAME}.sh"], cwd=options.workdir)
Example #4
0
def main(args=None):
    """Set up (and optionally launch) the pipeline identified by ``NAME``.

    Steps: parse the options, let ``SequanaManager`` create the working
    directory, copy the options into the pipeline configuration (only when
    ``options.from_project`` is ``None``), run sanity checks on the genome
    directory and its GFF file, then call ``manager.teardown()``.

    :param args: argument vector including the program name at index 0;
        ``sys.argv`` is used when it is ``None``.
    :raises SystemExit: with status 1 when the genome FASTA file is missing
        or when one of the GFF sanity checks fails.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # Stays None when an existing project is reused, so that the final
    # custom GFF copy can tell "no configuration edited" apart from an error.
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        genome_directory = os.path.abspath(options.genome_directory)
        cfg.general.genome_directory = genome_directory
        cfg.general.aligner = options.aligner

        # the genome is named after its directory
        genome_name = os.path.basename(genome_directory)
        fasta = genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # Do we need the indexing ? For each supported aligner: the label
        # used in the log message and the file whose presence means that the
        # index already exists. Other aligners leave cfg.general.indexing
        # untouched.
        index_markers = {
            "bowtie2": ("bowtie2", f"/bowtie2/{genome_name}.rev.1.bt2"),
            "star": ("STAR", "/star/SAindex"),
            "bowtie1": ("bowtie1", f"/bowtie1/{genome_name}.rev.1.ebwt"),
            "salmon": ("salmon", "/salmon/salmon.done"),
        }
        if options.aligner in index_markers:
            label, marker = index_markers[options.aligner]
            if os.path.exists(genome_directory + marker):
                logger.info("Indexing found for {}.".format(label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(
                        label))
                cfg.general.indexing = True

        cfg.general.force_indexing = options.force_indexing
        cfg.general.contaminant_file = options.contaminant_file

        # The two options are mutually exclusive. The warning announces that
        # the rRNA feature is ignored, so ignore it for real instead of
        # aborting the setup.
        if options.rRNA_feature and options.contaminant_file:
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = bool(options.do_mark_duplicates)

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

        # NOTE(review): nothing from this module is used below; the import is
        # kept as in the original (presumably an installation check).
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # The GFF is only inspected when the user did not skip the check and
        # asked for a single feature. With a comma-separated list of features
        # the whole check is bypassed.
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            gff_file = genome_directory + "/" + genome_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.get_df()
            valid_types = gff.get_types()

            # first check the rRNA feature
            rrna_feature = cfg['general']["rRNA_feature"]
            if rrna_feature and rrna_feature not in valid_types:
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(valid_types))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "checking your input GFF file and feature counts options")

            # NOTE(review): the guard above guarantees a single feature here,
            # so the former "build custom.gff from several features" branch
            # could never run and has been dropped.
            selected = df_gff[df_gff['type'] == fc_type]
            n_features = len(selected)
            if n_features == 0:
                logger.error(
                    "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                    .format(fc_type, valid_types))
                sys.exit(1)
            logger.info("Found {} {} entries".format(n_features, fc_type))

            # now we check the attribute
            attributes = [
                key for entry in selected.attributes for key in entry.keys()
            ]
            known_attributes = set(attributes)
            n_attributes = attributes.count(fc_attr)
            if n_attributes == 0:
                logger.error(
                    "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                    .format(fc_attr, known_attributes))
                sys.exit(1)

            unique = {
                entry[fc_attr]
                for entry in selected.attributes if fc_attr in entry
            }
            # feature first, attribute second (they were swapped before)
            logger.info(
                "Found {} {} entries for attribute '{}' [{} unique entries]"
                .format(n_attributes, fc_type, fc_attr, len(unique)))

            if n_attributes != len(unique):
                logger.warning(
                    "Attribute non-unique. Feature counts should handle it"
                )

            if options.feature_counts_extra_attributes:
                for extra_attr in cfg.feature_counts.extra_attributes.split(
                        ","):
                    if extra_attr not in known_attributes:
                        logger.error(
                            "{} not found in the GFF attributes. Try one of {}"
                            .format(extra_attr, known_attributes))
                        sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # The custom GFF file, if any, must be moved into the working directory.
    # The option was added in a recent version, so the key may be missing
    # from the configuration.
    custom_gff = None
    if cfg is not None:
        try:
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort, as before, but no longer silent
            logger.warning("Could not copy {} into {}: {}".format(
                custom_gff, options.workdir, err))

    if options.run:
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
Example #5
0
def main(args=None):
    """Set up (and optionally launch) the pipeline identified by ``NAME``.

    Steps: parse the options, let ``SequanaManager`` create the working
    directory, copy the options into the pipeline configuration (only when
    ``options.from_project`` is ``None``), run sanity checks on the genome
    directory and its GFF file, then call ``manager.teardown()``.

    :param args: argument vector including the program name at index 0;
        ``sys.argv`` is used when it is ``None``.
    :raises SystemExit: with status 1 when the genome FASTA file is missing
        or when one of the GFF sanity checks fails.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger

    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # Stays None when an existing project is reused, so that the final
    # custom GFF copy can tell "no configuration edited" apart from an error.
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        genome_directory = os.path.abspath(options.genome_directory)
        cfg.general.genome_directory = genome_directory
        cfg.general.aligner = options.aligner

        # the genome is named after its directory
        genome_name = os.path.basename(genome_directory)
        fasta = genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(
                options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # -1 means "not set by the user": each tool then gets its own default
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = bool(options.do_mark_duplicates)

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc

        # The request is switched off (with a warning) when it cannot be
        # fulfilled: no GTF file given, or salmon used as the aligner.
        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file")
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file

        # NOTE(review): nothing from this module is used below; the import is
        # kept as in the original (presumably an installation check).
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # The GFF is only inspected when the user did not skip the check and
        # asked for a single feature. With a comma-separated list of features
        # the whole check is bypassed.
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "Checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3

            gff_file = genome_directory + "/" + genome_name + ".gff"

            gff = GFF3(gff_file)
            df_gff = gff.df
            valid_features = gff.features
            # gff.attributes is no longer read here: its value was never used

            # first check the rRNA feature
            rrna_feature = cfg["general"]["rRNA_feature"]
            if rrna_feature and rrna_feature not in valid_features:
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(
                        valid_features))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "Checking your input GFF file and feature counts options.")
            logger.info(
                f"You chose '{fc_type}' feature and '{fc_attr}' attribute")

            # NOTE(review): the guard above guarantees a single feature here,
            # so the former "build custom.gff from several features" branch
            # could never run and has been dropped.
            selected = df_gff[df_gff["genetic_type"] == fc_type]
            n_features = len(selected)
            if n_features == 0:
                logger.error(
                    "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                    .format(fc_type, valid_features))
                sys.exit(1)
            logger.info("Found {} '{}' entries".format(n_features, fc_type))

            # now we check the attribute
            attributes = [
                key for entry in selected.attributes for key in entry.keys()
            ]
            known_attributes = set(attributes)
            n_attributes = attributes.count(fc_attr)
            if n_attributes == 0:
                logger.error(
                    "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                    .format(fc_attr, known_attributes))
                sys.exit(1)

            unique = {
                entry[fc_attr]
                for entry in selected.attributes if fc_attr in entry
            }
            logger.info(
                "Found {} '{}' entries for the attribute [{} unique entries]"
                .format(n_attributes, fc_attr, len(unique)))

            if n_attributes != len(unique):
                logger.warning(
                    "Attribute non-unique. Feature counts should handle it"
                )

            if options.feature_counts_extra_attributes:
                for extra_attr in cfg.feature_counts.extra_attributes.split(
                        ","):
                    if extra_attr not in known_attributes:
                        logger.error(
                            "{} not found in the GFF attributes. Try one of {}"
                            .format(extra_attr, known_attributes))
                        sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # The custom GFF file, if any, must be moved into the working directory.
    # The option was added in a recent version, so the key may be missing
    # from the configuration.
    custom_gff = None
    if cfg is not None:
        try:
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort, as before, but no longer silent
            logger.warning("Could not copy {} into {}: {}".format(
                custom_gff, options.workdir, err))

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)