def __init__(self, folder, organism, alpha=0.05, log2_fc=0, progress=True,
             mapper=None, background=None):
    """Load RNADiff results, prepare KEGG access and compute enrichments.

    :param folder: forwarded to :class:`RNADiffResults`.
    :param organism: value assigned to the ``organism`` attribute of the
        bioservices ``KEGG`` instance.
    :param alpha: forwarded to :class:`RNADiffResults`.
    :param log2_fc: forwarded to :class:`RNADiffResults`.
    :param progress: forwarded to ``self._load_pathways``.
    :param mapper: stored as-is in ``self.mapper``.
    :param background: number of genes used as background. When falsy, the
        number of lines returned by ``kegg.list(organism)`` is used instead.

    A failure in ``compute_enrichment`` is logged, not raised, so the
    instance is still returned to the caller.
    """
    print("DRAFT in progress")
    from bioservices import KEGG

    self.kegg = KEGG(cache=True)
    self.kegg.organism = organism
    self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

    def _strip_gene_prefix(names):
        # identifiers may carry a "gene:" prefix; drop it everywhere
        return [x.replace("gene:", "") for x in names]

    # some clean up (done once for the ID column, the index and gene lists)
    if "ID" in self.rnadiff.df.columns:
        self.rnadiff.df['ID'] = _strip_gene_prefix(self.rnadiff.df['ID'])
    self.rnadiff.df.index = _strip_gene_prefix(self.rnadiff.df.index)
    for key, values in self.rnadiff.gene_lists.items():
        # only values of existing keys are replaced, so iterating is safe
        self.rnadiff.gene_lists[key] = _strip_gene_prefix(values)

    if background:
        self.background = background
    else:
        self.background = len(
            self.kegg.list(self.kegg.organism).split("\n"))
    logger.info("Set number of genes to {}".format(self.background))

    self._load_pathways(progress=progress)
    self.mapper = mapper

    try:
        self.compute_enrichment()
    except Exception as err:
        # best effort: keep the object usable but report why it failed
        logger.critical("An error occured while computing enrichments")
        logger.critical(err)
def check_input_files(self, stop_on_error=True):
    """Check that input files exist and can be read by ``FastQFactory``.

    The input pattern is built from ``input_directory`` and
    ``input_pattern`` of ``self.config.config``.

    :param stop_on_error: if True, exit with status 1 when no file matches
        the input pattern.

    Whatever the value of *stop_on_error*, the process exits with status 1
    if ``FastQFactory`` cannot be built from the input pattern.
    """
    # Sanity checks
    cfg = self.config.config
    pattern = cfg.input_directory + os.sep + cfg.input_pattern

    filenames = glob.glob(pattern)
    logger.info("Found {} files matching your input pattern ({})".format(
        len(filenames), cfg.input_pattern))

    if not filenames:
        logger.critical(
            "Found no files with your matching pattern ({})".format(
                cfg.input_pattern))
        if "*" not in cfg.input_pattern and "?" not in cfg.input_pattern:
            logger.critical(
                "No wildcard used in your input pattern, please use a * or ? character"
            )
        if stop_on_error:
            sys.exit(1)

    from sequana import FastQFactory

    # Exception (not a bare except) so that Ctrl-C is not reported as a
    # data-format problem.
    try:
        ff = FastQFactory(pattern, read_tag=cfg.input_readtag)

        # This tells whether the data is paired or not
        paired = "paired reads" if ff.paired else "single-end reads"
        logger.info(
            "Your input data seems to be made of {}".format(paired))
    except Exception:
        logger.error(
            "Input data is not fastq-compatible with sequana pipelines. "
            "You may want to set the read_tag to empty string or None if "
            "you wish to analyse non-fastQ files (e.g. BAM)")
        sys.exit(1)
def main(args=None):
    """Mostly checking the options provided by the user and then call
    :func:`sequana_init` function to create the pre-filled config file +
    snakemake + README +runme.sh in a dedicated project directory.

    :param args: list of command-line arguments including the program name
        as first item; defaults to a copy of ``sys.argv``.

    Informational options (--issue, --version, --show-pipelines, --info,
    --get-config) are handled first and return early. Invalid option
    combinations are logged and end with ``sys.exit(1)``; missing input
    files/directories and invalid adapter choices raise :class:`ValueError`.
    """
    import sequana

    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        sa = Tools()
        sa.purple("Welcome to Sequana standalone application")
        logger.critical("You must use --pipeline <valid pipeline name>\nuse "
                        "--show-pipelines or --help for more information. ")
        return
    else:
        options = user_options.parse_args(args[1:])

    # these imports must be local. This also speed up the --help
    sa = Tools(verbose=options.verbose)
    sa.purple("Welcome to Sequana standalone application")

    # Those options are mutually exclusive. The flag is a bit field:
    # --issue=32, --version=16, --info=8, --show-pipelines=4, --pipeline=2,
    # --get-config=1. The only accepted combination is
    # --pipeline + --get-config (3).
    flag = int(
        "%s%s%s%s%s%s" %
        (int(bool(options.issue)), int(bool(options.version)),
         int(bool(options.info)), int(bool(options.show_pipelines)),
         int(bool(options.pipeline)), int(bool(options.get_config))), 2)
    if flag not in [1, 2, 4, 8, 16, 3, 32]:
        logger.critical("You must use one of --pipeline, --info, "
                        "--show-pipelines, --issue, --version, --get-config")
        sys.exit(1)

    # OPTIONS that gives info and exit
    if options.issue:
        onweb('https://github.com/sequana/sequana/issues')
        return

    if options.version:
        sa.purple("Sequana version %s" % sequana.version)
        return

    if options.show_pipelines:
        sa.purple("Valid pipeline names:")
        for this in sorted(valid_pipelines):
            m = Module(this)
            sa.green(" - " + this)
            print(textwrap(m.overview, indent=8))
        return

    if options.info:
        module = Module(options.info)
        module.onweb()
        return

    if options.pipeline:
        # check validity of the pipeline name
        if options.pipeline not in valid_pipelines:
            txt = "".join([" - %s\n" % this for this in valid_pipelines])
            logger.critical("%s not a valid pipeline name. Use of one:\n" %
                            options.pipeline + txt)
            sys.exit(1)

    # copy locally the request config file from a specific pipeline
    if flag == 3:  # --get-config and --pipeline used
        module = Module(options.pipeline)
        copy_config_from_sequana(module)
        return

    # pipeline should be defined by now. Let us start the real work here
    Module("dag").check("warning")
    Module(options.pipeline).check("warning")

    # If user provides file1 and/or file2, check the files exist
    if options.file1 and os.path.exists(options.file1) is False:
        raise ValueError("%s does not exist" % options.file1)

    if options.file2 and os.path.exists(options.file2) is False:
        raise ValueError("%s does not exist" % options.file2)

    if options.kraken and os.path.exists(options.kraken) is False:
        raise ValueError("%s does not exist" % options.kraken)

    if options.input_directory and os.path.exists(
            options.input_directory) is False:
        raise ValueError("%s does not exist" % options.input_directory)

    # check valid combo of arguments. Bit field (most significant first):
    #   pattern=16, --input-directory=8, --file1=4, --file2=2, --config=1
    # Accepted values:
    #   0  : none of those options; redirect to input_directory=local
    #   1  : config file alone
    #   2  : --file2 alone
    #   4  : --file1 alone
    #   6  : --file1 + --file2 (4+2)
    #   8  : --input-directory alone
    #   16 : pattern alone
    flag = int(
        "%s%s%s%s%s" % (
            int(bool(options.pattern)),
            int(bool(options.input_directory)),
            int(bool(options.file1)),
            int(bool(options.file2)),
            int(bool(options.config)),
        ), 2)

    if flag not in [0, 1, 2, 4, 6, 8, 16]:
        logger.critical(help_input + "\n\nUse --help for more information")
        sys.exit(1)

    assert options.extension in ["fastq", "fq", "fastq.gz", "fq.gz", "bam"]

    # Note that we use abspath to make it more robust and easier to debug
    # If no options, we use input_directory and set it to "."
    if flag == 0 or options.input_directory:
        if flag == 0:
            options.input_directory = "."
        options.input_directory = os.path.abspath(options.input_directory)
        data = options.input_directory + os.sep + "*" + options.extension
        options.file1 = ""
        options.file2 = ""
        options.pattern = ""
        if options.verbose:
            logger.info("Looking for sample files matching %s" % data)
    elif options.pattern:
        options.pattern = os.path.abspath(options.pattern)
        data = os.path.abspath(options.pattern)
        options.input_directory = ""
        options.extension = ""
        options.file1 = ""
        options.file2 = ""
    elif options.config:
        # NOTE(review): `data` is not set in this branch although it is used
        # just below; confirm how a config-only call is expected to behave.
        pass
    elif options.file1:
        data = [options.file1]
        options.file1 = os.path.abspath(options.file1)
        if options.file2:
            # append (do not overwrite) so that both mates are checked
            data.append(options.file2)
            options.file2 = os.path.abspath(options.file2)
        options.input_directory = ""
        options.pattern = ""
        options.extension = ""

    # The factories are only built to validate the input; the resulting
    # object is not used afterwards.
    if options.extension == 'bam' or options.pattern.endswith('bam') or \
            options.pattern.endswith('bed'):
        ff = FileFactory(data)
    else:
        ff = FastQFactory(data, read_tag=options.input_readtag,
                          verbose=options.verbose)

    if options.pipeline == 'quality_control' or options.pipeline == 'rnaseq':
        # check combo. Bit field: --no-adapters=16, --design=8, --adapters=4,
        # --adapter_fwd=2, --adapter_rev=1
        flag = int(
            "%s%s%s%s%s" %
            (int(bool(options.no_adapters)), int(bool(options.design)),
             int(bool(options.adapters)), int(bool(options.adapter_fwd)),
             int(bool(options.adapter_rev))), 2)

        if flag not in [16, 12, 6, 4, 2, 3]:
            logger.critical(
                "You must use a design experimental file using --design"
                " and --adapters to indicate the type of adapters (PCRFree"
                " or Nextera), or provide the adapters directly as a "
                " string (or a file) using --adapter_fwd (AND --adapter_"
                "rev for paired-end data). A third way is to set --adapters"
                " to either Nextera, PCRFree, Rubicon or universal in which case "
                " all adapters will be used (slower). Finally, you may use "
                " --no-adapters for testing purpose or if you know there "
                " is no adapters")
            sys.exit(1)

        # flag 12 (design + adapters when wrong args provided)
        if options.design and options.adapters not in adapters_choice:
            raise ValueError(
                "When using --design, you must also "
                "provide the type of adapters using --adapters (set to "
                "one of %s )" % adapters_choice)

        if options.design and options.adapters:
            from sequana import FindAdaptersFromDesign
            fa = FindAdaptersFromDesign(options.design, options.adapters)
            fa.check()
        # flag 12 (design + adapters with correct args)
        # NOTE(review): this branch looks shadowed by the previous condition
        # and sets `adapters_fwd`/`adapters_rev` (with an extra "s") -- confirm.
        elif options.design and options.adapters in adapters_choice:
            options.adapters_fwd = options.adapters
            options.adapters_rev = options.adapters
        elif options.no_adapters:
            options.adapter_fwd = "XXXX"
            options.adapter_rev = "XXXX"
        else:
            if options.adapter_fwd is None:
                if options.adapters not in ["universal"] + adapters_choice:
                    msg = "Incorrect adapter choice %s. " % options.adapters
                    msg += "Correct values are :\n"
                    for this in ['universal'] + adapters_choice:
                        msg += " - {}\n ".format(this)
                    logger.error(msg)
                    raise ValueError

                # flag 4
                if options.adapters == "universal":
                    options.adapter_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
                    options.adapter_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
                # flag 4
                else:
                    # Let the pipeline handle the names
                    options.adapter_fwd = options.adapters
                    options.adapter_rev = options.adapters
            # flag 2/3
            else:
                if options.adapter_fwd:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_fwd):
                        AdapterReader(options.adapter_fwd)
                        options.adapter_fwd = "file:%s" % options.adapter_fwd
                if options.adapter_rev:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_rev):
                        AdapterReader(options.adapter_rev)
                        options.adapter_rev = "file:%s" % options.adapter_rev

        if options.design:
            # Just check the format
            adapter_finder = FindAdaptersFromDesign(options.design,
                                                    options.adapters)

    # If all options are valid, we can now create the tree structure
    sequana_init(options)
def main(args=None):
    """Run Kraken on one or two FastQ files against one or more databases.

    :param args: list of command-line arguments including the program name
        as first item; defaults to a copy of ``sys.argv``.

    With a single database :class:`KrakenPipeline` is used, otherwise
    :class:`KrakenSequential`. Results are written in the ``kraken``
    sub-directory of ``options.directory`` and a ``summary.html`` report is
    built with :class:`KrakenModule`.

    :raises ValueError: if a database is found neither in the sequana
        configuration path nor locally.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.level

    if options.update_taxonomy is True:
        from sequana.taxonomy import Taxonomy
        tax = Taxonomy()
        from sequana import sequana_config_path as cfg
        logger.info(
            "Will overwrite the local database taxonomy.dat in {}".format(cfg))
        tax.download_taxonomic_file(overwrite=True)
        sys.exit(0)

    # We put the import here to make the --help faster
    from sequana import KrakenPipeline
    from sequana.kraken import KrakenSequential
    devtools = DevTools()

    if options.download:
        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download(options.download)
        sys.exit()

    fastq = []
    if options.file1:
        devtools.check_exists(options.file1)
        fastq.append(options.file1)
    if options.file2:
        devtools.check_exists(options.file2)
        fastq.append(options.file2)

    from sequana import sequana_config_path as scfg
    if options.databases is None:
        logger.critical("You must provide a database")
        sys.exit(1)

    databases = []
    for database in options.databases:
        # short aliases for the databases
        if database == "toydb":
            database = "kraken_toydb"
        elif database == "minikraken":
            database = "minikraken_20141208"

        if os.path.exists(scfg + os.sep + database):  # in Sequana path
            databases.append(scfg + os.sep + database)
        elif os.path.exists(database):  # local database
            databases.append(database)
        else:
            msg = "Invalid database name (%s). Neither found locally "
            msg += "or in the sequana path %s; Use the --download option"
            raise ValueError(msg % (database, scfg))

    output_directory = options.directory + os.sep + "kraken"
    devtools.mkdirs(output_directory)

    def _pathto(x):
        # prefix a filename with the kraken output directory; falsy values
        # (option not set) are returned unchanged
        return "{}/kraken/{}".format(options.directory, x) if x else x

    # if there is only one database, use the pipeline else KrakenHierarchical
    if len(databases) == 1:
        logger.info("Using 1 database")
        k = KrakenPipeline(fastq, databases[0], threads=options.thread,
                           output_directory=output_directory,
                           confidence=options.confidence)

        k.run(output_filename_classified=_pathto(options.classified_out),
              output_filename_unclassified=_pathto(options.unclassified_out))
    else:
        logger.info("Using %s databases" % len(databases))
        k = KrakenSequential(fastq, databases, threads=options.thread,
                             output_directory=output_directory + os.sep,
                             force=True,
                             keep_temp_files=options.keep_temp_files,
                             output_filename_unclassified=_pathto(
                                 options.unclassified_out),
                             confidence=options.confidence)
        k.run(output_prefix="kraken")

    # This statements sets the directory where HTML will be saved
    from sequana.utils import config
    config.output_dir = options.directory

    # output_directory first argument: the directory where to find the data
    # output_filename is relative to the config.output_dir defined above
    kk = KrakenModule(output_directory, output_filename="summary.html")

    logger.info("Open ./%s/summary.html" % options.directory)
    logger.info("or ./%s/kraken/kraken.html" % options.directory)

    if options.html is True:
        # The original code called onweb() on an undefined name. Open the
        # summary page announced in the log message above instead.
        import webbrowser
        from pathlib import Path

        summary = Path(options.directory) / "summary.html"
        webbrowser.open(summary.resolve().as_uri())
def check_options(self, options):
    """Validate the cutadapt options and normalise them in place.

    :param options: object with the attributes ``cutadapt_design_file``,
        ``cutadapt_adapter_choice``, ``cutadapt_fwd`` and ``cutadapt_rev``.
        The last three may be modified.

    Behaviour, depending on the options:

    * design file given: the forward/reverse adapters must not be set and
      an adapter choice is required; the design is then checked with
      ``FindAdaptersFromDesign``. Any failure is logged and ends with
      ``sys.exit(1)``.
    * adapter choice ``"none"``: the choice is reset to None and both
      adapters are set to ``"XXXX"``.
    * adapter choice ``"universal"``: both adapters are set to hard-coded
      sequences.
    * no adapter choice: an adapter that is an existing file is parsed with
      ``AdapterReader`` and replaced by ``file:<absolute path>``; any other
      string is left untouched.
    * any other adapter choice: nothing to do.
    """
    design = options.cutadapt_design_file
    adapter_choice = options.cutadapt_adapter_choice

    if design:
        if options.cutadapt_fwd or options.cutadapt_rev:
            logger.critical(
                "When using --cutadapt-design-file, one must not"
                " set the forward/reverse adapters with --cutadapt-fwd"
                " and/or --cutadapt-rev\n\n" + self.description)
            sys.exit(1)

        # otherwise, we just check the format but we need the adapter choice
        if adapter_choice in [None, 'none']:
            logger.critical(
                "When using --cutadapt-design-file, you must also"
                " provide the type of adapters using --cutadapt-adapter-choice"
                " (set to one of %s )" % self.adapters_choice)
            sys.exit(1)

        from sequana import FindAdaptersFromDesign
        fa = FindAdaptersFromDesign(design, adapter_choice)
        # Exception (not a bare except) so that Ctrl-C is not reported as a
        # design-file problem
        try:
            fa.check()
        except Exception:
            logger.critical("Your design file contains indexes not found "
                            "in the list of adapters from {}".format(
                                adapter_choice))
            sys.exit(1)

    # No design provided here below
    # do we need to remove adapters at all ?
    elif adapter_choice == "none":
        options.cutadapt_adapter_choice = None
        options.cutadapt_fwd = "XXXX"
        options.cutadapt_rev = "XXXX"
    # or just the universal ones ?
    elif adapter_choice == "universal":
        options.cutadapt_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
        options.cutadapt_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
    # or do we have a string or files provided for the fwd/rev ?
    elif adapter_choice is None:
        if options.cutadapt_fwd:
            # Could be a string or a file. If a file, check the format
            if os.path.exists(options.cutadapt_fwd):
                AdapterReader(options.cutadapt_fwd)
                options.cutadapt_fwd = "file:{}".format(
                    os.path.abspath(options.cutadapt_fwd))
        if options.cutadapt_rev:
            # Could be a string or a file. If a file, check the format
            if os.path.exists(options.cutadapt_rev):
                AdapterReader(options.cutadapt_rev)
                options.cutadapt_rev = "file:{}".format(
                    os.path.abspath(options.cutadapt_rev))
    # any other adapter choice: nothing to do, the cutadapt rules from
    # sequana will use the adapter_choice, and fill the fwd/rev automatically
def main(args=None):
    """Prepare (and optionally launch) the bcl2fastq pipeline.

    :param args: list of command-line arguments including the program name
        as first item; defaults to ``sys.argv``.

    The sample sheet and the BCL directory must exist, otherwise the
    process exits with status 1. An invalid sample sheet is only reported
    at this stage. When the run parameters file mentions "NextSeq", a
    merging strategy set to "none" is refused (exit status 1).
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    parser = Options(NAME, epilog=sequana_epilog)
    options = parser.parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger

    logger.setLevel(options.level)

    # ============================================== sanity checks
    for required in (options.samplesheet, options.bcl_directory):
        if not os.path.exists(required):
            logger.error(f"{required} file does not exists")
            sys.exit(1)

    # Check the sample sheet
    from sequana import iem

    try:
        samplesheet = iem.IEM(options.samplesheet)
        samplesheet.validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            """Your sample sheet seems to be incorrect. Before running the pipeline you will have to fix it. You may use 'sequana samplesheet --quick-fix'""")

    # The run parameters file is named RunParameters.xml (NextSeq) or
    # runParameters.xml (HiSeq); the first one found is used
    runparam = None
    for basename in ("RunParameters.xml", "runParameters.xml"):
        candidate = options.bcl_directory + os.sep + basename
        if os.path.exists(candidate):
            runparam = candidate
            break
    else:
        logger.warning("RunParameters.xml or runParameters.xml file not found")

    if runparam:
        with open(runparam, "r") as fin:
            data = fin.read()

        strategy = options.merging_strategy
        if "NextSeq" in data and strategy != "merge":
            if strategy == "none_and_force":
                logger.warning(
                    "This is a NextSeq. You set the --merging-strategy to"
                    " none_and_force. So, we proceed with no merging strategy")
            if strategy == "none":
                logger.warning(
                    "This is a NextSeq run. You must set the "
                    " --merging-strategy to 'merge'.")
                sys.exit(1)

    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        from sequana.iem import IEM

        ss = IEM(cfg.bcl2fastq.samplesheet_file)
        ss.validate()

        # this is defined by the working_directory
        # cfg.bcl2fastq.output_directory = "."
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        strategy = options.merging_strategy
        if strategy == "merge":
            cfg.bcl2fastq.merge_all_lanes = True
        elif strategy in ["none", "none_and_force"]:
            cfg.bcl2fastq.merge_all_lanes = False

        # extra bcl2fastq options for the --mars-seq case; lanes are then
        # not merged
        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            if strategy in ["merge"]:
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown(check_input_files=False)

    if options.run:
        script = '{}.sh'.format(NAME)
        subprocess.Popen(["sh", script], cwd=options.workdir)
def main(args=None):
    """Prepare (and optionally launch) the rnaseq pipeline.

    :param args: list of command-line arguments including the program name
        as first item; defaults to ``sys.argv``.

    Unless ``--from-project`` is used, the configuration is filled from the
    command-line options. The genome FASTA file must exist, and the GFF
    file is checked against the rRNA feature and the feature counts
    feature/attribute (unless the check is skipped or several features are
    given). Failed checks are logged and stop the program.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # cfg is only filled (and checked) when the configuration is built from
    # the command-line options
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # genome name = cfg.genome.genome_directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if os.path.exists(fasta) is False:
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit()

        # Do we need the indexing ? For each aligner: the label used in the
        # log messages and the file whose presence means "index available"
        index_markers = {
            "bowtie2": ("bowtie2",
                        prefix + f"/bowtie2/{genome_name}.rev.1.bt2"),
            "star": ("STAR", prefix + "/star/SAindex"),
            "bowtie1": ("bowtie1",
                        prefix + f"/bowtie1/{genome_name}.rev.1.ebwt"),
            "salmon": ("salmon",
                       cfg.general.genome_directory + "/salmon/salmon.done"),
        }
        if options.aligner in index_markers:
            label, marker = index_markers[options.aligner]
            if os.path.exists(marker):
                logger.info("Indexing found for {}.".format(label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(
                        label))
                cfg.general.indexing = True

        # options.do_indexing
        cfg.general.force_indexing = options.force_indexing
        cfg.general.rRNA_feature = options.rRNA_feature
        cfg.general.contaminant_file = options.contaminant_file

        # NOTE(review): the message says the rRNA feature "will be ignored"
        # but the program stops here -- confirm the intended behaviour.
        if options.rRNA_feature and options.contaminant_file:
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            sys.exit(1)

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carfully the GFF; if users knows what he is doing; no need to
        # check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(
                cfg["general"]["genome_directory"])
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.get_df()
            valid_types = gff.get_types()

            # first check the rRNA feature
            if cfg['general']["rRNA_feature"] and \
                    cfg['general']["rRNA_feature"] not in valid_types:
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(valid_types))
                sys.exit()

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "checking your input GFF file and feature counts options")
            # if only one feature (99% of the projet)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                # NOTE(review): not reached as long as the enclosing test
                # requires a single feature -- confirm.
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(',')
                gff.save_gff_filtered(features=fc_types, filename='custom.gff')
                cfg.general.custom_gff = 'custom.gff'

            # the loop variable must be named fc_type: it is referenced by
            # name (@fc_type) in the query string below
            for fc_type in fc_types:
                S = sum(df_gff['type'] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_types))
                    sys.exit()
                else:
                    logger.info("Found {} {} entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, set(attributes)))
                    sys.exit()
                else:
                    unique = set([
                        x[fc_attr] for k, x in dd.attributes.items()
                        if fc_attr in x
                    ])
                    logger.info(
                        "Found {} {} entries for attribute '{}' [{} unique entries]"
                        .format(S, fc_attr, fc_type, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in set(attributes):
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, set(attributes)))
                            sys.exit()

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directoty. This is a
    # best-effort step: the custom_gff entry may not exist (option added in
    # latest version), hence failures are only reported at debug level.
    if cfg is not None:
        try:
            if cfg.general.custom_gff:
                shutil.copy(cfg.general.custom_gff, options.workdir)
        except Exception as err:
            logger.debug("custom GFF file not copied: {}".format(err))

    if options.run:
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
def main(args=None):
    """Prepare (and optionally launch) the sequana_rnaseq pipeline.

    :param args: list of command-line arguments including the program name
        as first item; defaults to ``sys.argv``.

    Unless ``--from-project`` is used, the configuration is filled from the
    command-line options. The genome FASTA file must exist, and the GFF
    file is checked against the rRNA feature and the feature counts
    feature/attribute (unless the check is skipped or several features are
    given). Failed checks are logged and stop the program.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # cfg is only filled (and checked) when the configuration is built from
    # the command-line options
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # genome name = cfg.genome.genome_directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if os.path.exists(fasta) is False:
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit()

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(
                options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # a quality of -1 means "use the default of the selected tool"
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc

        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file")
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file

        # -------------------------------------------------------- RNAdiff
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carfully the GFF; if users knows what he is doing; no need to
        # check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "Checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(cfg.general.genome_directory)
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.df  # This takes one minute on eukaryotes.
            valid_features = gff.features  # about 3 seconds
            # gff.attributes is not read here: its result was never used

            # first check the rRNA feature
            if (cfg["general"]["rRNA_feature"] and
                    cfg["general"]["rRNA_feature"] not in valid_features):
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(
                        valid_features))
                sys.exit()

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "Checking your input GFF file and feature counts options.")
            logger.info(
                f"You chose '{fc_type}' feature and '{fc_attr}' attribute")
            # if only one feature (99% of the projet)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                # NOTE(review): not reached as long as the enclosing test
                # requires a single feature -- confirm.
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(",")
                gff.save_gff_filtered(features=fc_types, filename="custom.gff")
                cfg.general.custom_gff = "custom.gff"

            # the loop variable must be named fc_type: it is referenced by
            # name (@fc_type) in the query string below
            for fc_type in fc_types:
                S = sum(df_gff["genetic_type"] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_features))
                    sys.exit()
                else:
                    logger.info("Found {} '{}' entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("genetic_type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, set(attributes)))
                    sys.exit()
                else:
                    unique = set([
                        x[fc_attr] for k, x in dd.attributes.items()
                        if fc_attr in x
                    ])
                    logger.info(
                        "Found {} '{}' entries for the attribute [{} unique entries]"
                        .format(S, fc_attr, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in set(attributes):
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, set(attributes)))
                            sys.exit()

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directoty. This is a
    # best-effort step: the custom_gff entry may not exist (option added in
    # latest version), hence failures are only reported at debug level.
    if cfg is not None:
        try:
            if cfg.general.custom_gff:
                shutil.copy(cfg.general.custom_gff, options.workdir)
        except Exception as err:
            logger.debug("custom GFF file not copied: {}".format(err))

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)