def __init__(self, pattern="**/summary.json", output_filename=None, verbose=True, **kargs):
    """Collect summary files and build the multi-summary HTML report.

    :param pattern: glob pattern, expanded recursively, used to locate the
        summary files.
    :param output_filename: forwarded as-is to ``self.create_html``.
    :param verbose: when ``False`` the sequana logger is set to WARNING;
        any other value sets it to INFO.
    :param kargs: accepted for backward compatibility; not used here.
    """
    super().__init__()

    from sequana import logger

    logger.setLevel("WARNING" if verbose is False else "INFO")
    logger.info(
        "Sequana Summary is still a tool in progress and have been "
        " tested with the quality_control pipeline only for now."
    )

    self.title = "Sequana multiple summary"
    self.devtools = DevTools()

    self.filenames = list(glob.iglob(pattern, recursive=True))
    self.summaries = [ReadSummary(filename) for filename in self.filenames]
    # Reuse the summaries loaded just above rather than reading and parsing
    # every file a second time only to extract its project name.
    self.projects = [summary.data["project"] for summary in self.summaries]

    self.create_report_content()
    self.create_html(output_filename)
def main(args=None):
    """Command-line entry point of ``sequana_substractor``.

    :param args: full argument vector, program name first. Defaults to
        ``sys.argv``. The list is copied, so the caller's list is never
        modified.

    With no argument besides the program name, ``--help`` is assumed.
    ``--version`` prints the sequana version and exits with status 0.
    Otherwise the reference patterns given through ``--reference`` and
    ``--references`` are glob-expanded and handed to ``Substractor``.
    """
    # Work on a private copy: the "--help" fallback below appends to the list
    # and must not leak into sys.argv or into a caller-supplied list.
    args = list(sys.argv) if args is None else list(args)

    print(purple("Welcome to sequana_substractor"))
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))

    user_options = Options(prog="sequana_substractor")
    if len(args) == 1:
        args.append("--help")

    # Look at the effective argument list (not sys.argv) so that the flag is
    # also honoured when main() is called programmatically.
    if "--version" in args:
        import sequana

        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])
    logger.setLevel(options.level)

    # build the references list; --reference and --references are cumulative
    # (previously --reference was silently dropped when both were provided)
    patterns = []
    if options.reference:
        patterns.append(options.reference)
    if options.references:
        patterns.extend(options.references)
    options.references = patterns

    # expand globs if any
    references = []
    for pattern in patterns:
        references.extend(glob.glob(pattern))

    logger.info("{} references provided: {}".format(len(references), ",".join(references)))

    # call the entire machinery here
    sub = Substractor(options.input, references, options.outdir, options.mapper, options.threads)
    sub.run(options.outfile)
def main(args=None):
    """Set up, and optionally launch, the bcl2fastq pipeline.

    :param args: full argument vector, program name first. Defaults to
        ``sys.argv``.

    Steps, in order:

    1. parse the options and let ``SequanaManager`` prepare the working
       directory;
    2. check that the sample sheet and the BCL directory exist (exit status 1
       otherwise) and try to validate the sample sheet (problems are logged
       as critical, execution continues);
    3. look for ``RunParameters.xml`` / ``runParameters.xml`` and refuse
       (exit status 1) a NextSeq run combined with ``--merging-strategy none``;
    4. unless ``--from-project`` is used, fill the configuration from the
       options;
    5. call ``manager.teardown`` and, with ``--run``, start ``<NAME>.sh`` in
       the working directory without waiting for it.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()

    from sequana import logger

    logger.setLevel(options.level)

    # ============================================== sanity checks
    if not os.path.exists(options.samplesheet):
        logger.error(f"{options.samplesheet} file does not exists")
        sys.exit(1)

    if not os.path.exists(options.bcl_directory):
        logger.error(f"{options.bcl_directory} file does not exists")
        sys.exit(1)

    # Check the sample sheet. This is deliberately non fatal: the problem is
    # reported and the set-up continues.
    from sequana import iem

    try:
        samplesheet = iem.IEM(options.samplesheet)
        samplesheet.validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            "Your sample sheet seems to be incorrect. Before running the pipeline"
            " you will have to fix it. You may use 'sequana samplesheet --quick-fix'"
        )

    # NextSeq
    runparam_1 = options.bcl_directory + os.sep + "RunParameters.xml"
    # HiSeq
    runparam_2 = options.bcl_directory + os.sep + "runParameters.xml"

    if os.path.exists(runparam_1):
        runparam = runparam_1
    elif os.path.exists(runparam_2):
        runparam = runparam_2
    else:
        runparam = None
        logger.warning("RunParameters.xml or runParameters.xml file not found")

    if runparam:
        # Only the read needs the file handle; release it before the checks.
        with open(runparam, "r") as fin:
            data = fin.read()

        if "NextSeq" in data and options.merging_strategy != "merge":
            if options.merging_strategy == "none_and_force":
                msg = "This is a NextSeq. You set the --merging-strategy to"
                msg += " none_and_force. So, we proceed with no merging strategy"
                logger.warning(msg)
            if options.merging_strategy == "none":
                msg = "This is a NextSeq run. You must set the "
                msg += " --merging-strategy to 'merge'."
                # Fatal: report at ERROR level like the other fatal checks of
                # this function, so the reason for the exit is still shown
                # when the logging level is set above WARNING.
                logger.error(msg)
                sys.exit(1)

    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        # NOTE(review): this validates the same sample sheet a second time,
        # without the try/except used above, so an invalid sheet that was only
        # reported earlier presumably raises here. Kept as is; confirm intent.
        from sequana.iem import IEM

        ss = IEM(cfg.bcl2fastq.samplesheet_file)
        ss.validate()

        # the output directory is defined by the working_directory
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        if options.merging_strategy == "merge":
            cfg.bcl2fastq.merge_all_lanes = True
        elif options.merging_strategy in ["none", "none_and_force"]:
            cfg.bcl2fastq.merge_all_lanes = False

        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            if options.merging_strategy in ["merge"]:
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown(check_input_files=False)

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)
def main(args=None):
    """Configure, and optionally launch, the RNA-seq pipeline.

    :param args: full argument vector, program name first. Defaults to
        ``sys.argv``.

    Steps, in order:

    1. parse the options and let ``SequanaManager`` prepare the working
       directory;
    2. unless ``--from-project`` is used, fill the configuration from the
       options (genome directory, aligner and indexing, cutadapt, feature
       counts, optional tools);
    3. unless ``--skip-gff-check`` is set, check that the rRNA feature, the
       feature type, the attribute and the extra attributes requested for
       feature counts exist in ``<genome_directory>/<genome_name>.gff``;
    4. call ``manager.teardown``, copy the custom GFF (if any) into the
       working directory and, with ``--run``, start ``<NAME>.sh`` there
       without waiting for it.

    Every failed check terminates the program with exit status 1.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()

    from sequana import logger

    logger.setLevel(options.level)

    # Stays None with --from-project so that the later steps can tell that no
    # configuration was built here (instead of hitting an unbound name).
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(options.genome_directory)
        cfg.general.aligner = options.aligner

        # the genome name is the last component of the genome directory
        genome_name = os.path.basename(cfg.general.genome_directory)
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                "Could not find {}. You must have the genome sequence in fasta"
                " with the extension .fa named after the genome directory.".format(fasta)
            )
            sys.exit(1)

        # Do we need the indexing ? Each aligner leaves a known file behind
        # once its index is built: (file to look for, name used in the logs).
        index_markers = {
            "bowtie2": (prefix + f"/bowtie2/{genome_name}.rev.1.bt2", "bowtie2"),
            "star": (prefix + "/star/SAindex", "STAR"),
            "bowtie1": (prefix + f"/bowtie1/{genome_name}.rev.1.ebwt", "bowtie1"),
            "salmon": (prefix + "/salmon/salmon.done", "salmon"),
        }
        marker = index_markers.get(options.aligner)
        if marker is not None:
            marker_file, aligner_label = marker
            if os.path.exists(marker_file):
                logger.info("Indexing found for {}.".format(aligner_label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(aligner_label)
                )
                cfg.general.indexing = True

        cfg.general.force_indexing = options.force_indexing
        cfg.general.rRNA_feature = options.rRNA_feature
        cfg.general.contaminant_file = options.contaminant_file
        if options.rRNA_feature and options.contaminant_file:
            # The contaminant file takes precedence. Do what the message says,
            # i.e. ignore the rRNA feature, rather than aborting.
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            cfg.general.rRNA_feature = None

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

    import sequana_pipelines.rnaseq

    # SANITY CHECKS
    # -------------------------------------- do we find rRNA feature in the GFF ?
    # if we do not build a custom feature_counts set of options, no need to
    # check carefully the GFF; if users knows what he is doing; no need to
    # check the GFF either
    if (
        cfg is not None
        and options.skip_gff_check is False
        and "," not in cfg.feature_counts.feature
    ):
        logger.info("checking your input GFF file and rRNA feature if provided")

        from sequana.gff3 import GFF3

        genome_directory = os.path.abspath(cfg["general"]["genome_directory"])
        genome_name = os.path.basename(genome_directory)
        prefix_name = genome_directory + "/" + genome_name
        gff_file = prefix_name + ".gff"
        gff = GFF3(gff_file)
        df_gff = gff.get_df()
        valid_types = gff.get_types()

        # first check the rRNA feature
        rrna_feature = cfg["general"]["rRNA_feature"]
        if rrna_feature and rrna_feature not in valid_types:
            logger.error(
                "rRNA feature not found in the input GFF ({})".format(gff_file)
                + " This is probably an error. Please check the GFF content and /or"
                " change the feature name with --rRNA-feature based on the content"
                " of your GFF. Valid features are: {}".format(valid_types)
            )
            sys.exit(1)

        # then, check the main feature
        fc_type = cfg.feature_counts.feature
        fc_attr = cfg.feature_counts.attribute

        logger.info("checking your input GFF file and feature counts options")
        # if only one feature (99% of the project)
        if "," not in fc_type:
            fc_types = [fc_type]
        else:
            # NOTE(review): unreachable as written, since the enclosing
            # condition already requires a comma-free feature. Kept untouched.
            logger.info(
                "Building a custom GFF file (custom.gff) using Sequana. Please wait"
            )
            fc_types = fc_type.split(",")
            gff.save_gff_filtered(features=fc_types, filename="custom.gff")
            cfg.general.custom_gff = "custom.gff"

        # NOTE: the loop variable must keep the name ``fc_type``; the pandas
        # query below refers to it through ``@fc_type``.
        for fc_type in fc_types:
            S = sum(df_gff["type"] == fc_type)
            if S == 0:
                logger.error(
                    "Found 0 entries for feature '{}'. Please choose a valid feature from: {}".format(
                        fc_type, valid_types
                    )
                )
                sys.exit(1)
            logger.info("Found {} {} entries".format(S, fc_type))

            # now we check the attribute:
            dd = df_gff.query("type==@fc_type")
            attributes = [y for x in dd.attributes for y in x.keys()]
            attribute_names = set(attributes)
            S = attributes.count(fc_attr)
            if S == 0:
                logger.error(
                    "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}".format(
                        fc_attr, attribute_names
                    )
                )
                sys.exit(1)

            unique = {x[fc_attr] for x in dd.attributes if fc_attr in x}
            # The feature type comes first and the attribute is the quoted
            # name (the two arguments used to be swapped).
            logger.info(
                "Found {} {} entries for attribute '{}' [{} unique entries]".format(
                    S, fc_type, fc_attr, len(unique)
                )
            )
            if S != len(unique):
                logger.warning("Attribute non-unique. Feature counts should handle it")

            if options.feature_counts_extra_attributes:
                for extra_attr in cfg.feature_counts.extra_attributes.split(","):
                    if extra_attr not in attribute_names:
                        logger.error(
                            "{} not found in the GFF attributes. Try one of {}".format(
                                extra_attr, attribute_names
                            )
                        )
                        sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory
    custom_gff = None
    if cfg is not None:
        try:
            # option added in latest version: it may be missing from the config
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # still best effort, but the failure is no longer hidden
            logger.warning(f"Could not copy {custom_gff} into {options.workdir}: {err}")

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)
def main(args=None):
    """Configure, and optionally launch, the ``sequana_rnaseq`` pipeline.

    :param args: full argument vector, program name first. Defaults to
        ``sys.argv``.

    Steps, in order:

    1. parse the options and let ``SequanaManager`` prepare the working
       directory;
    2. unless ``--from-project`` is used, fill the configuration from the
       options (genome, contaminant file or rRNA feature, trimming, feature
       counts, optional tools, RNA-SeQC and RSeQC);
    3. unless ``--skip-gff-check`` is set, check that the rRNA feature, the
       feature type, the attribute and the extra attributes requested for
       feature counts exist in ``<genome_directory>/<genome_name>.gff``;
    4. call ``manager.teardown``, copy the custom GFF (if any) into the
       working directory and, with ``--run``, start ``<NAME>.sh`` there
       without waiting for it.

    Every failed check terminates the program with exit status 1.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()

    from sequana import logger

    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # Stays None with --from-project so that the later steps can tell that no
    # configuration was built here (instead of hitting an unbound name).
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(options.genome_directory)
        cfg.general.aligner = options.aligner

        # the genome name is the last component of the genome directory
        genome_name = os.path.basename(cfg.general.genome_directory)
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                "Could not find {}. You must have the genome sequence in fasta"
                " with the extension .fa named after the genome directory.".format(fasta)
            )
            sys.exit(1)

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # -1 means "not set by the user": each tool then gets its own default
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc

        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file"
                )
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file

    # -------------------------------------------------------- RNAdiff
    import sequana_pipelines.rnaseq

    # SANITY CHECKS
    # -------------------------------------- do we find rRNA feature in the GFF ?
    # if we do not build a custom feature_counts set of options, no need to
    # check carefully the GFF; if users knows what he is doing; no need to
    # check the GFF either
    if (
        cfg is not None
        and options.skip_gff_check is False
        and "," not in cfg.feature_counts.feature
    ):
        logger.info("Checking your input GFF file and rRNA feature if provided")

        from sequana.gff3 import GFF3

        genome_directory = os.path.abspath(cfg.general.genome_directory)
        genome_name = os.path.basename(genome_directory)
        prefix_name = genome_directory + "/" + genome_name
        gff_file = prefix_name + ".gff"
        gff = GFF3(gff_file)
        df_gff = gff.df  # This takes one minute on eukaryotes.
        valid_features = gff.features  # about 3 seconds
        # ``gff.attributes`` (about 10 seconds) is no longer evaluated here:
        # its result was never used.

        # first check the rRNA feature
        rrna_feature = cfg["general"]["rRNA_feature"]
        if rrna_feature and rrna_feature not in valid_features:
            logger.error(
                "rRNA feature not found in the input GFF ({})".format(gff_file)
                + " This is probably an error. Please check the GFF content and /or"
                " change the feature name with --rRNA-feature based on the content"
                " of your GFF. Valid features are: {}".format(valid_features)
            )
            sys.exit(1)

        # then, check the main feature
        fc_type = cfg.feature_counts.feature
        fc_attr = cfg.feature_counts.attribute

        logger.info("Checking your input GFF file and feature counts options.")
        logger.info(f"You chose '{fc_type}' feature and '{fc_attr}' attribute")
        # if only one feature (99% of the project)
        if "," not in fc_type:
            fc_types = [fc_type]
        else:
            # NOTE(review): unreachable as written, since the enclosing
            # condition already requires a comma-free feature. Kept untouched.
            logger.info(
                "Building a custom GFF file (custom.gff) using Sequana. Please wait"
            )
            fc_types = fc_type.split(",")
            gff.save_gff_filtered(features=fc_types, filename="custom.gff")
            cfg.general.custom_gff = "custom.gff"

        # NOTE: the loop variable must keep the name ``fc_type``; the pandas
        # query below refers to it through ``@fc_type``.
        for fc_type in fc_types:
            S = sum(df_gff["genetic_type"] == fc_type)
            if S == 0:
                logger.error(
                    "Found 0 entries for feature '{}'. Please choose a valid feature from: {}".format(
                        fc_type, valid_features
                    )
                )
                sys.exit(1)
            logger.info("Found {} '{}' entries".format(S, fc_type))

            # now we check the attribute:
            dd = df_gff.query("genetic_type==@fc_type")
            attributes = [y for x in dd.attributes for y in x.keys()]
            attribute_names = set(attributes)
            S = attributes.count(fc_attr)
            if S == 0:
                logger.error(
                    "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}".format(
                        fc_attr, attribute_names
                    )
                )
                sys.exit(1)

            unique = {x[fc_attr] for x in dd.attributes if fc_attr in x}
            logger.info(
                "Found {} '{}' entries for the attribute [{} unique entries]".format(
                    S, fc_attr, len(unique)
                )
            )
            if S != len(unique):
                logger.warning("Attribute non-unique. Feature counts should handle it")

            if options.feature_counts_extra_attributes:
                for extra_attr in cfg.feature_counts.extra_attributes.split(","):
                    if extra_attr not in attribute_names:
                        logger.error(
                            "{} not found in the GFF attributes. Try one of {}".format(
                                extra_attr, attribute_names
                            )
                        )
                        sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory
    custom_gff = None
    if cfg is not None:
        try:
            # option added in latest version: it may be missing from the config
            custom_gff = cfg.general.custom_gff
        except (AttributeError, KeyError):
            custom_gff = None
    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # still best effort, but the failure is no longer hidden
            logger.warning(f"Could not copy {custom_gff} into {options.workdir}: {err}")

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)