def copy_file(self, filename, target_dir):
    """Copy a file into a sub-directory of the report directory.

    :param str filename: file to copy.
    :param str target_dir: directory, relative to ``config.output_dir``,
        where the file is copied. It is created if missing.
    :return: relative path of the new file location
        (``target_dir`` + separator + basename of *filename*).
    :raises FileExistsError: if the target path exists but is not a
        directory.
    :raises FileNotFoundError: if *filename* does not exist.
    """
    directory = config.output_dir + os.sep + target_dir
    try:
        # exist_ok accepts an already existing directory but still raises
        # FileExistsError when the path exists and is not a directory.
        os.makedirs(directory, exist_ok=True)
    except FileExistsError as err:
        msg = "{0} exist and it is not a directory".format(directory)
        logger.error(msg)
        raise FileExistsError(msg) from err

    try:
        shutil.copy(filename, directory)
    except FileNotFoundError as err:
        msg = "{0} doesn't exist".format(filename)
        logger.error(msg)
        raise FileNotFoundError(msg) from err

    return target_dir + os.sep + os.path.basename(filename)
def __init__(self, cutadapt_log, sample_name, output_filename=None):
    """Parse a cutadapt (or atropos) log and create the HTML report.

    :param cutadapt_log: path to the cutadapt log file. When a file with
        the same name but ``.txt`` replaced by ``.json`` exists, the
        atropos parser is used on that JSON file instead.
    :param sample_name: stored in :attr:`sample_name`.
    :param output_filename: forwarded to :meth:`create_html`.
    """
    super().__init__()

    # The cutadapt log is the expected input; a missing file is only
    # reported, the construction carries on.
    if not os.path.exists(cutadapt_log):
        logger.error("This file {} does not exist".format(cutadapt_log))

    self.input_filename = cutadapt_log
    self.sample_name = sample_name
    self.jinja = {}
    self.data = {}

    atropos_log = cutadapt_log.replace(".txt", ".json")
    use_atropos = os.path.exists(atropos_log)

    # The mode is set before reading the raw data, as in both original paths
    self.input_mode = "atropos" if use_atropos else "cutadapt"
    self.read_data()  # store the rawdata
    if use_atropos:
        self.parse_atropos(atropos_log)
    else:
        self.parse_cutadapt()

    self._data_histograms = self._get_histogram_data()
    self.create_report_content()
    self.create_html(output_filename)
def __init__(self, filename, **kwargs):
    """.. rubric:: constructor

    :param str filename: a vcf file.
    :param kwargs: any arguments accepted by vcf.Reader
    :raises FileNotFoundError: if *filename* cannot be found.
    """
    self.filename = filename
    try:
        # The handle is handed over to vcf.Reader as its ``fsock``
        filin = open(filename, "r")
        vcf.Reader.__init__(self, fsock=filin, **kwargs)
        self._get_start_index()
    except FileNotFoundError as e:
        logger.error(
            "FileNotFoundError({0}): {1}".format(e.errno, e.strerror)
        )
        # Re-raise the caught exception itself so that errno, message and
        # filename are preserved for the caller.
        raise

    # initiate filters dictionary
    self._filters_params = {
        'freebayes_score': 0,
        'frequency': 0,
        'min_depth': 0,
        'forward_depth': 0,
        'reverse_depth': 0,
        'strand_ratio': 0,
    }
    self._is_joint = self._check_if_joint()
def __init__(self, reference, log=None):
    """.. rubric:: Constructor

    :param reference: annotation reference file. The program exits with
        status 1 if it is not an existing file.
    :param log: log file. An existing file with that name is removed.
    """
    # Guard clause: the input file must exist
    if not os.path.isfile(reference):
        logger.error("FileNotFoundError: The file " + reference +
                     " does not exist")
        sys.exit(1)

    self.reference = reference
    # name of the reference: basename up to the first dot
    self.ref_name = os.path.basename(reference).split('.')[0]

    # Set the log file, starting from a fresh one
    self.log_file = log
    if log is not None and os.path.isfile(log):
        os.remove(log)

    # Check if snpEff.config is present
    if not os.path.exists("snpEff.config"):
        self._get_snpeff_config()

    # Create custom database
    predictor = os.sep.join(["data", self.ref_name, "snpEffectPredictor.bin"])
    if not os.path.exists(predictor):
        self._add_custom_db()
    elif not self._check_database(self.ref_name):
        self._add_db_in_config()
def __init__(self, annotation, log=None, snpeff_datadir="data",
             fastafile=None):
    """.. rubric:: Constructor

    :param annotation: annotation reference file. Its extension must be
        ``.genbank``/``.gbk`` or ``.gff3``/``.gff``; otherwise, or if the
        file does not exist, the program exits with status 1.
    :param log: log file. An existing file with that name is removed.
    :param snpeff_datadir: directory in which the snpEff database is
        looked for.
    :param fastafile: if a GFF is used, you must provide the FASTA input
        file as well
    """
    # Guard clause: the input file must exist
    if not os.path.isfile(annotation):
        logger.error("FileNotFoundError: The file " + annotation +
                     " does not exist")
        sys.exit(1)

    self.annotation = annotation
    self.fastafile = fastafile
    # name of the reference: basename up to the first dot
    self.ref_name = os.path.basename(annotation).split('.')[0]

    if self.annotation.endswith((".genbank", ".gbk")):
        self.format = "gbk"
    elif self.annotation.endswith((".gff3", ".gff")):
        self.format = "gff3"
    else:
        logger.error("Format must be genbank or gff3")
        sys.exit(1)

    # Keep data directory where everything will be saved
    self.snpeff_datadir = snpeff_datadir

    # Set the log file, starting from a fresh one
    self.log_file = log
    if log is not None and os.path.isfile(log):
        os.remove(log)

    # Check if snpEff.config is present
    if os.path.exists("snpEff.config"):
        logger.info("snpEff.config file exists already. Using it.")
    else:
        logger.info("snpEff.config file not found, creating one")
        self._get_snpeff_config()

    # Create custom database
    predictor = os.sep.join(
        [self.snpeff_datadir, self.ref_name, "snpEffectPredictor.bin"])
    if not os.path.exists(predictor):
        self._add_custom_db()
    elif not self._check_database(self.ref_name):
        self._add_db_in_config()
    else:
        logger.info("DB already added in your config and database")
def check(self):
    """Check that at least one sample has a matching adapter.

    :meth:`get_adapters_from_sample` is called for every name in
    :attr:`sample_names`; each failure is logged as an error.

    :raises ValueError: if the lookup succeeded for none of the samples
        (including when there is no sample at all).
    """
    found = 0
    for sample in self.sample_names:
        try:
            self.get_adapters_from_sample(sample)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not mistaken for a missing index.
            logger.error("No index found for sample %s" % sample)
        else:
            found += 1

    if found == 0:
        raise ValueError("None of the sample match any of the adapters")
def _get_package_location(self):
    """Return the ``data`` directory of the installed pipeline package.

    The distribution looked up is ``sequana_<name>`` where ``<name>`` is
    :attr:`name`.

    :return: ``<location>/sequana_pipelines/<name>/data`` where
        ``<location>`` is the location reported by pkg_resources.
    :raises pkg_resources.DistributionNotFound: if the distribution is
        not installed (the error is logged first).
    """
    import pkg_resources

    fullname = "sequana_{}".format(self.name)
    try:
        info = pkg_resources.get_distribution(fullname)
    except pkg_resources.DistributionNotFound:
        # The original handler used an undefined name here and failed with
        # NameError instead of reporting the missing package.
        logger.error("package provided (%s) not installed." % fullname)
        raise

    sharedir = os.sep.join(
        [info.location, "sequana_pipelines", self.name, 'data'])
    return sharedir
def get_roi(self):
    """Keep positions with zscore outside of the thresholds range.

    When genbank features are available but the chromosome name is not
    one of their keys, an alternative name is guessed from the last
    non-empty ``|``-separated field of the chromosome name, without its
    version suffix. If the guess fails, features are ignored.

    :return: a dataframe from :class:`FilteredGenomeCov`

    .. note:: depends on the :attr:`thresholds` low and high values.
    """
    features = self.bed.feature_dict
    try:
        # Both locals are referenced by name (@...) in the query string,
        # so they must live in the frame that calls DataFrame.query.
        second_high = self.thresholds.high2
        second_low = self.thresholds.low2
        query = "zscore > @second_high or zscore < @second_low"

        # in the genbank, the names appears as e.g. JB12345
        # but in the fasta or BED files, it may be something like
        # gi|269939526|emb|FN433596.1|
        # so they do not match. We can try to guess it
        alternative = None
        if features and self.chrom_name not in features.keys():
            # The header is formatted first so that a '%' in a genbank key
            # appended below cannot break the formatting.
            msg = ("Chromosome name (%s) not found in the genbank. "
                   "Make sure the chromosome names in the BAM/BED files "
                   "are compatible with the genbank content. Genbank "
                   "files contains the following keys ") % self.chrom_name
            for this in features.keys():
                msg += "\n - %s" % this

            fields = [x for x in self.chrom_name.split("|") if x]
            alternative = fields[-1]  # assume the accession is last
            alternative = alternative.split('.')[0]  # remove version
            if alternative in features.keys():
                msg += ("\n Guessed the chromosome name to be: %s"
                        % alternative)
            else:
                features = None
            logger.warning(msg)

        selection = self.df.query(query)
        if not features:
            return FilteredGenomeCov(selection, self.thresholds)
        key = alternative if alternative else self.chrom_name
        return FilteredGenomeCov(selection, self.thresholds, features[key])
    except KeyError:
        # The class documentation is appended to the message. It used to
        # be passed as a logging argument without any placeholder.
        logger.error("Column zscore is missing in data frame.\n"
                     "You must run compute_zscore before get low coverage."
                     "\n\n%s" % self.__doc__)
        sys.exit(1)
def get_sequana_adapters(type_, direction):
    """Return path to a list of adapters in FASTA format

    :param type_: adapter type; must be one of the registered adapters
        (the original documentation mentions PCRFree, Rubicon, Nextera).
    :param direction: fwd, rev, revcomp
    :return: path to the adapter filename
    :raises ValueError: if *type_* or *direction* is not valid.
    """
    # search possible types
    registered = _get_registered_adapters()
    if type_ not in registered:
        msg = "This adapter type (%s) is not valid" % type_
        logger.error(msg)
        logger.error("choose one in %s types" % registered)
        raise ValueError(msg)

    directions = ["fwd", "rev", "revcomp"]
    if direction not in directions:
        msg = "This kind of tag (%s) is not valid" % direction
        logger.error(msg)
        logger.error("choose one in %s " % directions)
        raise ValueError(msg)

    # Files named with the legacy "adapters_" prefix are tried first
    try:
        this = sequana_data("adapters_%s_%s.fa" % (type_, direction))
    except Exception:
        # Legacy name not found: fall back on the name without prefix
        return sequana_data("%s_%s.fa" % (type_, direction))
    logger.warning("Rename {} (remove the adapters_ prefix)".format(this))
    return this
def main(args=None):
    """Entry point of the pipeline: set up the working directory.

    :param args: command line arguments including the program name;
        defaults to ``sys.argv``.

    Exits with status 1 when fewer than two flowcell directories are
    provided.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here.
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.level = options.level

    if options.from_project is None:
        # fill the config file with input parameters
        cfg = manager.config.config

        # There is no need for input pattern / parameters in this pipeline,
        # just the input path where fastq files are to be found.
        cfg.input_pattern = options.input_pattern
        cfg.flowcell_paths = [
            os.path.abspath(x) for x in options.flowcell_paths
        ]

        # "< 2" rather than "== 1": an empty list is as invalid as a
        # single directory.
        if len(cfg.flowcell_paths) < 2:
            logger.error(
                "To merge flowcells, you must provide at least two directories"
            )
            sys.exit(1)

        for path in cfg.flowcell_paths:
            manager.exists(path)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    # No need to check for input files since the
    # input_directory / read_tag is not used in this pipeline
    manager.teardown(check_input_files=False)
def __init__(self, input_filename, **kwargs):
    """
    :param str input_filename: a bcf file.
    :param kwargs: any arguments accepted by VariantFile.
    :raises OSError: if the parent constructor fails with an OSError.
    """
    try:
        super().__init__(input_filename, **kwargs)
    except OSError:
        logger.error("OSError: {0} doesn't exist.".format(input_filename))
        # Re-raise the caught exception itself: a new bare OSError would
        # lose the errno and message of the actual failure.
        raise

    # initiate filters dictionary
    self._filters = {
        'freebayes_score': 0,
        'frequency': 0,
        'min_depth': 0,
        'forward_depth': 0,
        'reverse_depth': 0,
        'strand_ratio': 0,
    }
def __init__(self, filename, verbose=True, **kwargs):
    """.. rubric:: constructor

    :param str filename: a vcf file.
    :param bool verbose: if True, print the VCF version that was found.
    :param kwargs: any arguments accepted by vcf.Reader
    :raises FileNotFoundError: if *filename* cannot be found.
    """
    self.filename = filename
    try:
        # The handle is handed over to vcf.Reader as its ``fsock``
        filin = open(filename, "r")
        vcf.Reader.__init__(self, fsock=filin, **kwargs)
        self._get_start_index()
    except FileNotFoundError as e:
        logger.error("FileNotFoundError({0}): {1}".format(
            e.errno, e.strerror))
        # Re-raise the caught exception itself so that errno, message and
        # filename are preserved for the caller.
        raise

    if verbose:
        print("Found VCF version {}".format(self.version))
def _add_custom_db(self):
    """Add your custom file in the local snpEff database.

    The annotation (and the FASTA file for GFF3 input) is copied into
    ``data/<ref_name>/`` and ``snpEff build`` is run. The program exits
    if the FASTA file required for GFF3 is missing or if the build
    returns a non-zero code.
    """
    logger.info("adding custom DB using your input file(s)")
    logger.info(" - {}".format(self.annotation))
    if self.fastafile:
        logger.info(" - {}".format(self.fastafile))

    # create directory (trailing separator included) for the genome files
    genome_dir = os.sep.join(["data", self.ref_name, ""])
    try:
        os.makedirs(genome_dir)
    except FileExistsError:
        pass

    # add new annotation file in config file
    self._add_db_in_config()

    if self.format == "gbk":
        shutil.copyfile(self.annotation, genome_dir + "genes.gbk")
        snpeff_build_line = ["snpEff", "build", "-genbank", '-v',
                             self.ref_name]
    elif self.format == "gff3":
        shutil.copyfile(self.annotation, genome_dir + "genes.gff")
        fasta_missing = (self.fastafile is None
                         or not os.path.exists(self.fastafile))
        if fasta_missing:
            logger.error("Input file {} does not exist".format(
                self.fastafile))
            sys.exit(1)
        shutil.copyfile(self.fastafile, genome_dir + "sequences.fa")
        snpeff_build_line = ["snpEff", "build", "-gff3", '-v',
                             self.ref_name]

    # Output of the build goes to the log file when one is set
    if self.log_file:
        with open(self.log_file, "ab") as fl:
            snp_build = sp.Popen(snpeff_build_line, stderr=fl, stdout=fl)
    else:
        snp_build = sp.Popen(snpeff_build_line)

    rc = snp_build.wait()
    if rc != 0:
        logger.error("snpEff build return a non-zero code")
        sys.exit(rc)
def copy_requirements(self):
    """Copy the files listed as requirements into the working directory.

    Nothing is done when the configuration has no ``requirements`` entry.
    For each requirement:

    - an existing local path is copied into :attr:`workdir`;
    - a name starting with ``http`` is skipped;
    - anything else is looked up in :attr:`datapath` and copied into
      :attr:`workdir`; the program exits with status 1 on failure.
    """
    # FIXME
    # code redundant with snaketools.config.copy_requirements
    if 'requirements' not in self.config.config:
        return

    for requirement in self.config.config.requirements:
        if os.path.exists(requirement):
            # The original code copied to an undefined name and swallowed
            # the resulting NameError, so nothing was ever copied.
            try:
                shutil.copy(requirement, self.workdir)
            except shutil.SameFileError:
                pass  # the target and input may be the same
            except OSError as err:
                # best effort, as before: report and carry on
                logger.warning("Could not copy requirement {}: {}".format(
                    requirement, err))
        elif not requirement.startswith('http'):
            logger.info('Copying {} from sequana pipeline {}'.format(
                requirement, self.name))
            path = self.datapath + os.sep + requirement
            try:
                shutil.copy(path, self.workdir)
            except Exception as err:
                print(err)
                msg = "This requirement %s was not found in sequana."
                logger.error(msg % requirement)
                sys.exit(1)
def check_input_files(self, stop_on_error=True):
    """Check that input files match the pattern and are FastQ-compatible.

    :param bool stop_on_error: if True, exit with status 1 when no file
        matches the input pattern.

    The program always exits with status 1 when the FastQFactory cannot
    be built from the input pattern.
    """
    # Sanity checks
    cfg = self.config.config
    pattern = cfg.input_directory + os.sep + cfg.input_pattern

    filenames = glob.glob(pattern)
    logger.info("Found {} files matching your input pattern ({})".format(
        len(filenames), cfg.input_pattern))

    if not filenames:
        logger.critical(
            "Found no files with your matching pattern ({})".format(
                cfg.input_pattern))
        if "*" not in cfg.input_pattern and "?" not in cfg.input_pattern:
            logger.critical(
                "No wildcard used in your input pattern, please use a * or ? character"
            )
        if stop_on_error:
            sys.exit(1)

    from sequana import FastQFactory

    try:
        ff = FastQFactory(pattern, read_tag=cfg.input_readtag)
        # This tells whether the data is paired or not
        paired = "paired reads" if ff.paired else "single-end reads"
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not reported as invalid input data.
        logger.error(
            "Input data is not fastq-compatible with sequana pipelines. "
            "You may want to set the read_tag to empty string or None if "
            "you wish to analyse non-fastQ files (e.g. BAM)")
        sys.exit(1)

    logger.info("Your input data seems to be made of {}".format(paired))
def main(args=None):
    """Entry point of the coverage standalone.

    :param args: command line arguments including the program name;
        defaults to a copy of ``sys.argv``.

    Depending on the options this downloads a reference and/or a genbank,
    or analyses the coverage of a BAM/BED file chromosome by chromosome
    and, unless skipped, creates a multiqc report.

    :raises ValueError: if the input file is neither a BAM nor a BED file.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
        # nothing else to do should the help not terminate the program
        return
    options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))
        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference,
                          method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Only the extension is replaced: str.replace would also alter a
        # ".bam" found elsewhere in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" %
                 (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold
    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    # (-1 means that no specific chromosome was selected)
    chrom_list = [] if options.chromosome == -1 else [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(
            len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"],
                    gc.positions[this]["end"])
            logger.info(
                " {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        for i, chrom in enumerate(chromosomes):
            # Report the chromosome actually analysed: gc.chrom_names[i]
            # is wrong when a single chromosome was selected.
            logger.info(
                "==================== analysing chrom/contig %s/%s (%s)"
                % (i + 1, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Argument list built explicitly so that a configuration path
        # containing spaces is not split into several arguments.
        cmd = ["multiqc", ".", "-m", "sequana_coverage", "-f", "-c",
               pathtocfg]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
def main(args=None):
    """Entry point of the coverage standalone.

    :param args: command line arguments including the program name;
        defaults to a copy of ``sys.argv``.

    Depending on the options this downloads a reference and/or a genbank,
    or analyses the coverage of a BAM/BED file chromosome by chromosome
    and, unless skipped, creates a multiqc report.

    :raises ValueError: if the input file is neither a BAM nor a BED file.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
        # nothing else to do should the help not terminate the program
        return
    options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))
        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference,
                          method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Only the extension is replaced: str.replace would also alter a
        # ".bam" found elsewhere in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" %
                 (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold
    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    # (-1 means that no specific chromosome was selected)
    chrom_list = [] if options.chromosome == -1 else [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(
            len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            start = gc.positions[this]["start"]
            end = gc.positions[this]["end"]
            data = (this, start, end, end - start)
            logger.info(
                " {} (starting pos: {}, ending pos: {}, length: {})".format(
                    *data))

        # here we read chromosome by chromosome to save memory.
        for i, chrom in enumerate(chromosomes):
            # Report the chromosome actually analysed: gc.chrom_names[i]
            # is wrong when a single chromosome was selected.
            logger.info(
                "==================== analysing chrom/contig %s/%s (%s)"
                % (i + 1, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    # logging level seems to be reset to warning somewhere
    logger.level = options.logging_level

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Argument list built explicitly so that a configuration path
        # containing spaces is not split into several arguments.
        cmd = ["multiqc", ".", "-m", "sequana_coverage", "-f", "-c",
               pathtocfg]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
    logger.info("Done")
def download_fasta(self, filelist, output_dir=None, from_ena=True):
    """Download a FASTA (or list of)

    :param filelist: one of

        - a list of identifiers;
        - a name ending in ``.txt`` that is not a local file: the list is
          fetched from http://www.ebi.ac.uk/genomes/;
        - ``macaca``, ``mus_musculus`` or ``worms`` (built-in lists);
        - a key of the internal metadata;
        - the name of a local file containing identifiers;
        - a single identifier.
    :param output_dir: directory where FASTA files are saved (created if
        needed); defaults to the current directory.
    :param from_ena: not used by this method.
    :raises TypeError: if *filelist* is neither a string nor a list.

    .. warning:: The filename is named after the accession without .X number
        If there are several variant .1, .2 the later will be used. This
        should not happen if the list is properly defined.
    """
    from bioservices import ENA

    # The list case is tested first: the string methods used below would
    # fail on a list.
    if isinstance(filelist, list):
        identifiers = filelist[:]
    elif not isinstance(filelist, str):
        raise TypeError("filelist must be a string or a list of identifiers")
    elif filelist.endswith(".txt") and not os.path.exists(filelist):
        logger.info(
            "Downloading list from http://www.ebi.ac.uk/genomes/%s" % filelist)
        data = urlopen("http://www.ebi.ac.uk/genomes/%s" % filelist).readlines()
        identifiers = [x.strip().decode() for x in data]
    elif filelist == "macaca":
        identifiers = [
            "CM001276", "CM001277", "CM001278", "CM001279", "CM001280",
            "CM001281", "CM001282", "CM001283", "CM001284", "CM001285",
            "CM001286", "CM001287", "CM001288", "CM001289", "CM001290",
            "CM001291", "CM001292", "CM001293", "CM001294", "CM001295",
            "CM001296",
        ]
    elif filelist == "mus_musculus":
        # 19 +x+y chromosomes + 5 mitochondrion
        # could also add strain C57BL.
        # NOTE: commas were missing at the end of some lines, which merged
        # adjacent accessions into a single invalid identifier; CM000225
        # and CM000226 were also listed twice.
        identifiers = [
            "AY172335", "CM000209", "CM000210", "CM000211",
            "CM000212", "CM000213", "CM000214", "CM000215", "CM000216",
            "CM000217", "CM000218", "CM000219", "CM000220", "CM000221",
            "CM000222", "CM000223", "CM000224", "CM000225", "CM000226",
            "CM000227", "CM000228", "CM000229",
            "EF108342", "AB042432", "AY675564", "DQ874614",
        ]
    elif filelist == "worms":
        # Caernorhabditis briggsae and elegans
        identifiers = [
            "AC186293", "FR847112", "FR847113", "FR847114", "FR847118",
            "FR847121", "FR847123", "BX284601", "BX284602", "BX284603",
            "BX284604", "BX284605", "BX284606",
        ]
    elif filelist in self._metadata.keys():
        name = self._metadata[filelist][0]
        logger.info(
            "Downloading list from http://www.ebi.ac.uk/genomes/%s" % name)
        data = urlopen("http://www.ebi.ac.uk/genomes/%s" % name).readlines()
        identifiers = [x.strip().decode() for x in data]
    elif os.path.exists(filelist):
        # a filename (assuming a single column)
        with open(filelist) as fin:
            identifiers = [x.strip() for x in fin.read().split()]
    else:
        # a single identifier
        identifiers = [filelist]

    self._identifiers = identifiers
    self.results = self.ena_id_to_gi_number(identifiers)

    # do not use caching things this could be huge data sets.
    ena = ENA()

    if output_dir is None:
        output_dir = "."
    elif output_dir:
        os.makedirs(output_dir, exist_ok=True)

    N = len(identifiers)
    pb = Progress(N)
    logger.info("Fetching all fasta from ENA")
    for i, identifier in enumerate(identifiers):
        filenames = glob.glob(output_dir + os.sep + "ENA_%s*" % identifier)
        if len(filenames) >= 1:
            pb.animate(i + 1)
            # no need to fetch and save the data it looks like...
            continue

        # download data from ENA
        data = ena.get_data(identifier, "fasta")

        # Split header and Fasta
        header, others = data.decode().split("\n", 1)

        # Source of failure:
        # - list and DB are not synchrone: e.g. some entries may be deleted
        if "suppressed" in header:
            continue
        if ">" not in header:
            continue

        # Do not use try/except since when it fails, this is a real issue
        name = header.strip(">").split(" ")[0]
        db, id_, acc = name.split("|")

        try:
            header = self.switch_header_to_gi(acc)
        except Exception:
            logger.error("Failed for this entry:")
            logger.error(identifier)
            logger.error(header)
            logger.error(name)
            continue

        # Save to local file
        # WARNINGS: extension is .fa because kraken-build expects .fa files
        filename = "%s_%s.fa" % (db, acc.split(".")[0])
        if output_dir:
            filename = output_dir + os.sep + filename
        with open(filename, "w") as fout:
            fout.write(header + "\n" + others)
        pb.animate(i + 1)
def main(args=None):
    """Entry point of the sequana_rnaseq pipeline set-up.

    :param args: command line arguments including the program name;
        defaults to ``sys.argv``.

    Unless ``--from-project`` is used, the configuration is filled from
    the command line options and, unless skipped, the GFF file found in
    the genome directory is checked against the feature counts options.
    Every failed check exits with status 1.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # genome name = cfg.genome.genome_directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if os.path.exists(fasta) is False:
            logger.critical(
                "Could not find {}. You must have the genome sequence in "
                "fasta with the extension .fa named after the genome "
                "directory.".format(fasta))
            # non-zero status: this is a failure
            sys.exit(1)

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(
                options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # -1 selects the default quality of the chosen software
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file")
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file
        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file

        # -------------------------------------------------------- RNAdiff
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carfully the GFF; if users knows what he is doing; no need to
        # check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "Checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(cfg.general.genome_directory)
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.df  # This takes one minute on eukaryotes.
            valid_features = gff.features  # about 3 seconds
            valid_attributes = gff.attributes  # about 10 seconds

            # first check the rRNA feature
            if (cfg["general"]["rRNA_feature"]
                    and cfg["general"]["rRNA_feature"] not in valid_features):
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(
                        valid_features))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "Checking your input GFF file and feature counts options.")
            logger.info(
                f"You chose '{fc_type}' feature and '{fc_attr}' attribute")

            # if only one feature (99% of the projet)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(",")
                gff.save_gff_filtered(features=fc_types,
                                      filename="custom.gff")
                cfg.general.custom_gff = "custom.gff"

            for fc_type in fc_types:
                S = sum(df_gff["genetic_type"] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_features))
                    sys.exit(1)
                else:
                    logger.info("Found {} '{}' entries".format(S, fc_type))

                # now we check the attribute:
                # (fc_type is referenced by name in the query string)
                dd = df_gff.query("genetic_type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, set(attributes)))
                    sys.exit(1)
                else:
                    unique = set([
                        x[fc_attr] for k, x in dd.attributes.items()
                        if fc_attr in x
                    ])
                    logger.info(
                        "Found {} '{}' entries for the attribute [{} unique entries]"
                        .format(S, fc_attr, len(unique)))

                    if S != len(unique):
                        logger.warning(
                            "Attribute non-unique. Feature counts should handle it"
                        )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in set(attributes):
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, set(attributes)))
                            sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directoty
    try:
        # option added in latest version
        if cfg.general.custom_gff:
            shutil.copy(cfg.general.custom_gff, options.workdir)
    except Exception:
        # Best effort: the option may be missing from the configuration and
        # cfg itself is not defined when --from-project is used.
        pass

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)
def main(args=None):
    """Prepare (and optionally launch) the bcl2fastq pipeline.

    Parses the command line, checks that the sample sheet and the BCL
    directory exist, validates the sample sheet, inspects the run parameters
    file to detect NextSeq runs, fills the pipeline configuration and finally
    asks the manager to write the project. When ``--run`` is set, the
    generated shell script is started in the working directory.

    :param list args: command line arguments, program name first. Defaults to
        ``sys.argv``.

    The process exits with status 1 when the sample sheet or the BCL
    directory is missing, or when a NextSeq run is used with the ``none``
    merging strategy.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # ============================================== sanity checks
    if not os.path.exists(options.samplesheet):
        logger.error(f"{options.samplesheet} file does not exists")
        sys.exit(1)

    if not os.path.exists(options.bcl_directory):
        # this option is a directory, so do not report it as a file
        logger.error(f"{options.bcl_directory} directory does not exist")
        sys.exit(1)

    # Check the sample sheet. A failure is only reported here; execution
    # deliberately continues (the sheet is validated again further down when
    # the configuration is filled).
    from sequana import iem
    try:
        samplesheet = iem.IEM(options.samplesheet)
        samplesheet.validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            "Your sample sheet seems to be incorrect. Before running the "
            "pipeline you will have to fix it. You may use "
            "'sequana samplesheet --quick-fix'")

    # NextSeq
    runparam_1 = os.path.join(options.bcl_directory, "RunParameters.xml")

    # HiSeq
    runparam_2 = os.path.join(options.bcl_directory, "runParameters.xml")

    if os.path.exists(runparam_1):
        runparam = runparam_1
    elif os.path.exists(runparam_2):
        runparam = runparam_2
    else:
        runparam = None
        logger.warning("RunParameters.xml or runParameters.xml file not found")

    if runparam:
        with open(runparam, "r") as fin:
            data = fin.read()

        if "NextSeq" in data and options.merging_strategy != "merge":
            if options.merging_strategy == "none_and_force":
                msg = "This is a NextSeq. You set the --merging-strategy to"
                msg += " none_and_force. So, we proceed with no merging strategy"
                logger.warning(msg)
            elif options.merging_strategy == "none":
                msg = "This is a NextSeq run. You must set the "
                msg += " --merging-strategy to 'merge'."
                # fatal condition: report it as an error so that it is not
                # filtered out when the logging level is above WARNING
                logger.error(msg)
                sys.exit(1)

    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        # validation on the absolute path stored in the config; unlike the
        # check above, an exception raised here is not caught
        from sequana.iem import IEM
        ss = IEM(cfg.bcl2fastq.samplesheet_file)
        ss.validate()

        # this is defined by the working_directory
        #cfg.bcl2fastq.output_directory = "."
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        if options.merging_strategy == "merge":
            cfg.bcl2fastq.merge_all_lanes = True
        elif options.merging_strategy in ["none", "none_and_force"]:
            cfg.bcl2fastq.merge_all_lanes = False

        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            if options.merging_strategy in ["merge"]:
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown(check_input_files=False)

    if options.run:
        # fire-and-forget: the generated script is started, not waited for
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
def feature_dict(self, anything):
    """Refuse any assignment to ``feature_dict``.

    Logs an error explaining that the attribute is derived from
    ``GenomeCov.genbank_filename`` and terminates the process with exit
    status 1. Presumably registered as the property setter; the decorator is
    not visible in this chunk.

    :param anything: the value the caller tried to assign (ignored).
    """
    # trailing space added after "when": the adjacent literals used to be
    # glued together as "whenGenomeCov.genbank_filename"
    logger.error("AttributeError: You can't set attribute.\n"
                 "GenomeCov.feature_dict is set when "
                 "GenomeCov.genbank_filename is set.")
    sys.exit(1)
def bed(self):
    """Report that ``ChromosomeCov.bed`` cannot be set by the user.

    Only an error message is logged; nothing is assigned or raised.
    """
    message = (
        "AttributeError: You can't set the ChromosomeCov.bed. "
        "Setting is done automatically when the class is created."
    )
    logger.error(message)
def plot(self, kind="pie", cmap="tab20c", threshold=1, radius=0.9,
         textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: either ``pie`` or ``barh``. Any other value is
        reported with an error message and nothing is plotted.
    :param cmap: colormap forwarded to the pie plot.
    :param threshold: percentage (strictly between 0 and 100) below which
        taxons are summed up into a single ``others`` entry.
    :param radius: radius forwarded to the pie plot.
    :param textcolor: color applied to the labels of the pie plot.
    :param kargs: forwarded to the underlying ``data.plot`` call.
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.

    .. todo:: For a future release, we could use this kind of plot
        https://stackoverflow.com/questions/57720935/how-to-use-correct-cmap-colors-in-nested-pie-chart-in-matplotlib
    """
    if len(self._df) == 0:
        return None

    if not self._data_created:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return None

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    df = self.get_taxonomy_db(list(self.taxons.index))

    # we add the unclassified only if needed
    if self.unclassified > 0:
        df.loc[-1] = ["Unclassified"] * 8

    data = self.taxons.copy()

    # we add the unclassified only if needed
    if self.unclassified > 0:
        data.loc[-1] = self.unclassified

    # convert the counts into percentages
    data = data / data.sum() * 100
    assert 0 < threshold < 100

    # everything below the threshold (1) is gather together and summarised
    # into 'others'
    others = data[data < threshold].sum()
    data = data[data >= threshold]

    # replace the taxon identifiers by their names
    names = df.loc[data.index]['name']
    data.index = names.values

    if others > 0:
        data.loc['others'] = others

    try:
        data.sort_values(inplace=True)
    except AttributeError:
        # fallback kept for old pandas releases without Series.sort_values
        data.sort(inplace=True)

    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
         textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: either ``pie`` or ``barh``. Any other value is
        reported with an error message and nothing is plotted.
    :param cmap: colormap forwarded to the pie plot.
    :param threshold: percentage (strictly between 0 and 100) below which
        taxons are summed up into a single ``others`` entry.
    :param radius: radius forwarded to the pie plot.
    :param textcolor: color applied to the labels of the pie plot.
    :param kargs: forwarded to the underlying ``data.plot`` call.
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.
    """
    if len(self._df) == 0:
        return None

    if not self._data_created:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return None

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    # label-based .loc replaces the .ix indexer, which was removed from pandas
    df = self.get_taxonomy_biokit(list(self.taxons.index))
    df.loc[-1] = ["Unclassified"] * 8

    data = self.taxons.copy()
    data.loc[-1] = self.unclassified

    # convert the counts into percentages
    data = data / data.sum() * 100
    assert 0 < threshold < 100

    # entries below the threshold are summed into 'others'; ">=" keeps the
    # entries exactly equal to the threshold, which used to vanish from both
    # groups
    others = data[data < threshold].sum()
    data = data[data >= threshold]

    # replace the taxon identifiers by their names
    names = df.loc[data.index]['name']
    data.index = names.values
    data.loc['others'] = others

    try:
        data.sort_values(inplace=True)
    except AttributeError:
        # fallback kept for old pandas releases without Series.sort_values
        data.sort(inplace=True)

    # text may be long so, let us increase the figsize a little bit
    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def main(args=None):
    """Prepare (and optionally launch) the RNA-seq pipeline.

    Parses the command line, fills the pipeline configuration from the
    options (genome, aligner indexing, cutadapt, feature counts, optional
    tools) and, unless ``--skip-gff-check`` is given or several feature types
    are requested, checks that the requested rRNA feature, feature type and
    attributes are present in the genome GFF file. The manager then writes
    the project; with ``--run`` the generated script is started.

    :param list args: command line arguments, program name first. Defaults to
        ``sys.argv``.

    Every fatal configuration problem is logged and terminates the process
    with exit status 1.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # genome name = cfg.genome.genome_directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            # non-zero status: a missing genome is a failure
            sys.exit(1)

        # Do we need the indexing ? For each supported aligner: the label
        # used in the log messages and the file (relative to the genome
        # directory) whose presence means the index was already built.
        index_markers = {
            "bowtie2": ("bowtie2", f"/bowtie2/{genome_name}.rev.1.bt2"),
            "star": ("STAR", "/star/SAindex"),
            "bowtie1": ("bowtie1", f"/bowtie1/{genome_name}.rev.1.ebwt"),
            "salmon": ("salmon", "/salmon/salmon.done"),
        }
        if options.aligner in index_markers:
            label, marker = index_markers[options.aligner]
            if os.path.exists(prefix + marker):
                logger.info("Indexing found for {}.".format(label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(
                        label))
                cfg.general.indexing = True

        #options.do_indexing
        cfg.general.force_indexing = options.force_indexing
        cfg.general.rRNA_feature = options.rRNA_feature
        cfg.general.contaminant_file = options.contaminant_file

        if options.rRNA_feature and options.contaminant_file:
            # NOTE(review): the message says the option is ignored but the
            # program stops here; confirm which behaviour is intended.
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            sys.exit(1)

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ---------------------------------------------------- others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carfully the GFF; if users knows what he is doing; no need to
        # check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(
                cfg["general"]["genome_directory"])
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.get_df()
            valid_types = gff.get_types()

            # first check the rRNA feature
            if cfg['general']["rRNA_feature"] and \
                    cfg['general']["rRNA_feature"] not in valid_types:
                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(valid_types))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "checking your input GFF file and feature counts options")
            # if only one feature (99% of the projet)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(',')
                gff.save_gff_filtered(features=fc_types, filename='custom.gff')
                cfg.general.custom_gff = 'custom.gff'

            # the loop variable must be called fc_type: it is referenced by
            # name ("@fc_type") in the DataFrame query below
            for fc_type in fc_types:
                S = sum(df_gff['type'] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_types))
                    sys.exit(1)
                else:
                    logger.info("Found {} {} entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                attribute_names = set(attributes)
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, attribute_names))
                    sys.exit(1)
                else:
                    unique = set([
                        x[fc_attr] for k, x in dd.attributes.items()
                        if fc_attr in x
                    ])
                    logger.info(
                        "Found {} {} entries for attribute '{}' [{} unique entries]"
                        .format(S, fc_attr, fc_type, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in attribute_names:
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, attribute_names))
                            sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directoty
    try:
        # option added in latest version
        if cfg.general.custom_gff:
            shutil.copy(cfg.general.custom_gff, options.workdir)
    except Exception:
        # best effort on purpose: cfg does not exist when --from-project is
        # used and older configurations have no custom_gff entry.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    if options.run:
        # fire-and-forget: the generated script is started, not waited for
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
def _extract_head_gz(self, N, output_filename="test.fastq.gz", level=6,
                     CHUNKSIZE=65536):
    """Write the first N lines of the gzipped input into a new file.

    The input (``self.filename``) is read in blocks of ``CHUNKSIZE``
    compressed bytes that are decompressed on the fly, so the whole file is
    never loaded in memory. Whether the output must be compressed is decided
    by ``self._istozip``; in that case ``self._gzip`` is called on the
    written file.

    :param N: number of lines to extract (converted with ``int``).
    :param str output_filename: name of the file to create.
    :param level: unused in this method.
    :param int CHUNKSIZE: number of compressed bytes read per iteration.
    :return: the running line counter. This is the total number of lines
        when the input has fewer than N lines and N when the N-th line ends
        a decompressed chunk; otherwise it is the number of lines counted
        before the chunk in which the limit was reached (historical
        behaviour, kept unchanged for the callers).
    :raises ValueError: if a block of compressed data decompresses to
        nothing.

    Benchmark notes from the original author (Python 3.5, Linux box): with
    an uncompressed output this is about 20% faster than
    ``zcat file | head -1000000 > output.fastq``; with a compressed output
    it is 3-4 times faster than
    ``zcat file | head -1000000 | gzip > output.fastq``.
    """
    # make sure N is integer
    N = int(N)

    # as fast as zcat file.fastq.gz | head -200000 > out.fastq

    # this is to supress the header
    decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)

    # will we gzip the output file ?
    output_filename, tozip = self._istozip(output_filename)

    with open(self.filename, 'rb') as fin:
        buf = fin.read(CHUNKSIZE)
        count = 0

        with open(output_filename, "wb") as fout:
            while buf:
                outstr = decoder.decompress(buf)
                if len(outstr) == 0:
                    msg = ("Error while decompressing the zip file. may need "
                           "to dezip/rezip the data. known issue in extract_head")
                    logger.error(msg)
                    raise ValueError(msg)

                this_count = outstr.count(b"\n")
                if count + this_count >= N:
                    # This chunk reaches the requested number of lines: keep
                    # only the lines still missing and stop. ">=" (not ">")
                    # so that nothing from a later chunk is appended when
                    # the N-th line ends exactly within this chunk.
                    missing = N - count
                    if missing > 0:
                        # Fix https://github.com/sequana/sequana/issues/536
                        # (no strip() before splitting)
                        lines = outstr.split(b"\n")
                        fout.write(b"\n".join(lines[0:missing]) + b"\n")
                    if count + this_count == N:
                        # same return value as before in the exact-fit case
                        count = N
                    break
                else:
                    count += this_count
                    fout.write(outstr)
                buf = fin.read(CHUNKSIZE)

    if tozip is True:
        self._gzip(output_filename)

    return count
def __init__(self, filename_fastq, fof_databases, threads=1,
             output_directory="./kraken_hierarchical/",
             keep_temp_files=False, force=False):
    """.. rubric:: **constructor**

    :param filename_fastq: FastQ file to analyse (a filename, or a list of
        one or two filenames)
    :param fof_databases: file that contains a list of databases paths
        (one per line). The order is important. Note that you may also
        provide a list of database paths.
    :param threads: number of threads to be used by Kraken
    :param output_directory: name of the output directory
    :param keep_temp_files: bool, if True, will keep intermediate files
        from each Kraken analysis, and save html report at each step
    :param bool force: if the output directory already exists, the
        instanciation fails so that the existing data is not overrwritten.
        If you wish to overwrite the existing directory, set this
        parameter to True.
    :raises TypeError: if the databases or the input files are not provided
        in one of the supported forms.
    :raises FileExistsError: if the output directory already exists and
        *force* is False.
    :raises OSError: if the output directory cannot be created for any
        other reason.
    """
    # When running kraken in paired mode and saving the unclassified reads
    # in a file, the output file (fastq) contains both R1 and R2 so there
    # are concatenated in the same file. Actually, if there is R1 and R2,
    # there are concatenated as R1 N R2 (with the letter N as link).
    # So, in the hiearchical search, paired case, the first iteration has 2
    # input files, must subsequent iterations will have only one file as
    # input, that is the output of the previous run (provided by
    # --unclassified-out option)
    self.filename_fastq = filename_fastq

    # input databases may be stored in a file
    if isinstance(fof_databases, str) and os.path.exists(fof_databases):
        with open(fof_databases, 'r') as fof:
            # surrounding whitespace is dropped and blank lines are skipped
            # so that they do not end up as empty database paths
            self.databases = [
                line.strip() for line in fof if line.strip()
            ]
    # or simply provided as a list
    elif isinstance(fof_databases, list):
        self.databases = fof_databases[:]
    else:
        raise TypeError("input databases must be a list of valid kraken "
                        "databases or a file (see documebntation)")

    self.threads = threads
    self.output_directory = output_directory
    self.keep_temp_files = keep_temp_files

    # check if the output directory already exist
    try:
        os.mkdir(output_directory)
    except OSError as err:
        if not os.path.isdir(output_directory):
            # not an "already exists" situation (missing parent, lack of
            # permission, a regular file in the way): do not hide it
            raise
        if force is False:
            msg = 'Output directory %s already exists' % output_directory
            logger.error(msg)
            raise FileExistsError(msg) from err
        elif force is True:
            logger.warning("Output directory %s already exists. You may "
                           "overwrite existing results" % output_directory)

    # list of input fastq files
    if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
        self.inputs = filename_fastq[:]
    elif isinstance(filename_fastq, str):
        self.inputs = [filename_fastq]
    else:
        msg = "input file must be a string or list of 2 filenames"
        msg += "\nYou provided {}".format(filename_fastq)
        raise TypeError(msg)
def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
         textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: either ``pie`` or ``barh``. Any other value is
        reported with an error message and nothing is plotted.
    :param cmap: colormap forwarded to the pie plot.
    :param threshold: percentage (strictly between 0 and 100) below which
        taxons are summed up into a single ``others`` entry.
    :param radius: radius forwarded to the pie plot.
    :param textcolor: color applied to the labels of the pie plot.
    :param kargs: forwarded to the underlying ``data.plot`` call.
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.
    """
    if len(self._df) == 0:
        return None

    if not self._data_created:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return None

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    # label-based .loc replaces the .ix indexer, which was removed from pandas
    df = self.get_taxonomy_biokit(list(self.taxons.index))
    df.loc[-1] = ["Unclassified"] * 8

    data = self.taxons.copy()
    data.loc[-1] = self.unclassified

    # convert the counts into percentages
    data = data / data.sum() * 100
    assert 0 < threshold < 100

    # entries below the threshold are summed into 'others'; ">=" keeps the
    # entries exactly equal to the threshold, which used to vanish from both
    # groups
    others = data[data < threshold].sum()
    data = data[data >= threshold]

    # replace the taxon identifiers by their names
    names = df.loc[data.index]['name']
    data.index = names.values
    data.loc['others'] = others

    try:
        data.sort_values(inplace=True)
    except AttributeError:
        # fallback kept for old pandas releases without Series.sort_values
        data.sort(inplace=True)

    # text may be long so, let us increase the figsize a little bit
    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def main(args=None):
    """Mostly checking the options provided by the user and then call
    :func:`sequana_init` function to create the pre-filled config file +
    snakemake + README +runme.sh in a dedicated project directory.

    :param list args: command line arguments, program name first. Defaults to
        a copy of ``sys.argv``.

    The informative options (--issue, --version, --show-pipelines, --info,
    and --get-config together with --pipeline) are served first and return
    immediately. Otherwise the input files and, for the quality_control and
    rnaseq pipelines, the adapter options are checked before the project is
    created.

    :raises ValueError: if an input file or directory does not exist, or if
        the adapter options are inconsistent.

    Invalid combinations of options terminate the process with exit status 1.
    """
    import sequana

    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        sa = Tools()
        sa.purple("Welcome to Sequana standalone application")
        logger.critical("You must use --pipeline <valid pipeline name>\nuse "
                        "--show-pipelines or --help for more information. ")
        return
    else:
        options = user_options.parse_args(args[1:])

    # these imports must be local. This also speed up the --help
    sa = Tools(verbose=options.verbose)
    sa.purple("Welcome to Sequana standalone application")

    # Those options are mutually exclusive. Each option is one bit of a
    # binary number: issue=32, version=16, info=8, show_pipelines=4,
    # pipeline=2, get_config=1. The only accepted combination is
    # pipeline + get_config (3).
    flag = int(
        "%s%s%s%s%s%s" %
        (int(bool(options.issue)), int(bool(options.version)),
         int(bool(options.info)), int(bool(options.show_pipelines)),
         int(bool(options.pipeline)), int(bool(options.get_config))), 2)
    if flag not in [1, 2, 4, 8, 16, 3, 32]:
        logger.critical("You must use one of --pipeline, --info, "
                        "--show-pipelines, --issue, --version, --get-config")
        sys.exit(1)

    # OPTIONS that gives info and exit
    if options.issue:
        onweb('https://github.com/sequana/sequana/issues')
        return

    if options.version:
        sa.purple("Sequana version %s" % sequana.version)
        return

    if options.show_pipelines:
        sa.purple("Valid pipeline names:")
        for this in sorted(valid_pipelines):
            m = Module(this)
            sa.green(" - " + this)
            print(textwrap(m.overview, indent=8))
        return

    if options.info:
        module = Module(options.info)
        module.onweb()
        return

    if options.pipeline:
        # check validity of the pipeline name
        if options.pipeline not in valid_pipelines:
            txt = "".join([" - %s\n" % this for this in valid_pipelines])
            logger.critical("%s not a valid pipeline name. Use of one:\n" %
                            options.pipeline + txt)
            sys.exit(1)

    # copy locally the request config file from a specific pipeline
    if flag == 3:  #--get-config and --pipeline used
        module = Module(options.pipeline)
        copy_config_from_sequana(module)
        return

    # pipeline should be defined by now. Let us start the real work here
    Module("dag").check("warning")
    Module(options.pipeline).check("warning")

    # If user provides file1 and/or file2, check the files exist
    if options.file1 and os.path.exists(options.file1) is False:
        raise ValueError("%s does not exist" % options.file1)

    if options.file2 and os.path.exists(options.file2) is False:
        raise ValueError("%s does not exist" % options.file2)

    if options.kraken and os.path.exists(options.kraken) is False:
        raise ValueError("%s does not exist" % options.kraken)

    if options.input_directory and os.path.exists(
            options.input_directory) is False:
        raise ValueError("%s does not exist" % options.input_directory)

    # check valid combo of arguments
    flag = int(
        "%s%s%s%s%s" % (
            int(bool(options.pattern)),
            int(bool(options.input_directory)),
            int(bool(options.file1)),
            int(bool(options.file2)),
            int(bool(options.config)),
        ), 2)

    # One bit per option, in the order used above:
    # --input-pattern alone: 16
    # --input-directory alone: 8
    # --file1 alone: 4
    # --file2 alone: 2
    # --file1 + --file2 : 4+2=6
    # config file alone : 1
    # none of those options redirect to input_directory=local
    if flag not in [0, 1, 2, 4, 6, 8, 16]:
        logger.critical(help_input + "\n\nUse --help for more information")
        sys.exit(1)

    assert options.extension in ["fastq", "fq", "fastq.gz", "fq.gz", "bam"]

    # Note that we use abspath to make it more robust and easier to debug
    # If no options, we use input_directory and set it to "."
    if flag == 0 or options.input_directory:
        if flag == 0:
            options.input_directory = "."
        options.input_directory = os.path.abspath(options.input_directory)
        data = options.input_directory + os.sep + "*" + options.extension
        options.file1 = ""
        options.file2 = ""
        options.pattern = ""
        if options.verbose:
            logger.info("Looking for sample files matching %s" % data)
    elif options.pattern:
        options.pattern = os.path.abspath(options.pattern)
        data = os.path.abspath(options.pattern)
        options.input_directory = ""
        options.extension = ""
        options.file1 = ""
        options.file2 = ""
    elif options.config:
        # NOTE(review): ``data`` is not defined in this branch although it
        # is used by the factories below -- confirm the intended behaviour
        # when only a config file is provided.
        pass
    elif options.file1:
        data = [options.file1]
        options.file1 = os.path.abspath(options.file1)
        if options.file2:
            # paired data: keep both files (file1 used to be overwritten)
            data.append(options.file2)
            options.file2 = os.path.abspath(options.file2)
        options.input_directory = ""
        options.pattern = ""
        options.extension = ""

    if options.extension == 'bam' or options.pattern.endswith('bam') or \
            options.pattern.endswith('bed'):
        ff = FileFactory(data)
    else:
        ff = FastQFactory(data,
                          read_tag=options.input_readtag,
                          verbose=options.verbose)

    if options.pipeline == 'quality_control' or options.pipeline == 'rnaseq':
        # check combo. One bit per option: no_adapters=16, design=8,
        # adapters=4, adapter_fwd=2, adapter_rev=1
        flag = int(
            "%s%s%s%s%s" %
            (int(bool(options.no_adapters)), int(bool(options.design)),
             int(bool(options.adapters)), int(bool(
                 options.adapter_fwd)), int(bool(options.adapter_rev))), 2)

        if flag not in [16, 12, 6, 4, 2, 3]:
            logger.critical(
                "You must use a design experimental file using --design"
                " and --adapters to indicate the type of adapters (PCRFree"
                " or Nextera), or provide the adapters directly as a "
                " string (or a file) using --adapter_fwd (AND --adapter_"
                "rev for paired-end data). A third way is to set --adapters"
                " to either Nextera, PCRFree, Rubicon or universal in which case "
                " all adapters will be used (slower). Finally, you may use "
                " --no-adapters for testing purpose or if you know there "
                " is no adapters")
            sys.exit(1)

        # flag 12 (design + adapters when wrong args provided)
        if options.design and options.adapters not in adapters_choice:
            raise ValueError(
                "When using --design, you must also "
                "provide the type of adapters using --adapters (set to "
                "one of %s )" % adapters_choice)

        if options.design and options.adapters:
            from sequana import FindAdaptersFromDesign
            fa = FindAdaptersFromDesign(options.design, options.adapters)
            fa.check()
        # flag 12 (design + adapters with correct args)
        # NOTE(review): this branch looks unreachable (the previous
        # condition already covers design + adapters) and it sets
        # ``adapters_fwd``/``adapters_rev`` rather than ``adapter_fwd``/
        # ``adapter_rev`` -- confirm before relying on it.
        elif options.design and options.adapters in adapters_choice:
            options.adapters_fwd = options.adapters
            options.adapters_rev = options.adapters
        elif options.no_adapters:
            options.adapter_fwd = "XXXX"
            options.adapter_rev = "XXXX"
        else:
            if options.adapter_fwd is None:
                if options.adapters not in ["universal"] + adapters_choice:
                    msg = "Incorrect adapter choice %s. " % options.adapters
                    msg += "Correct values are :\n"
                    for this in ['universal'] + adapters_choice:
                        msg += " - {}\n ".format(this)
                    logger.error(msg)
                    raise ValueError
                # flag 4
                if options.adapters == "universal":
                    options.adapter_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
                    options.adapter_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
                # flag 4
                else:
                    # Let the pipeline handle the names
                    options.adapter_fwd = options.adapters
                    options.adapter_rev = options.adapters
            # flag 2/3
            else:
                if options.adapter_fwd:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_fwd):
                        AdapterReader(options.adapter_fwd)
                        options.adapter_fwd = "file:%s" % options.adapter_fwd
                if options.adapter_rev:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_rev):
                        AdapterReader(options.adapter_rev)
                        options.adapter_rev = "file:%s" % options.adapter_rev

        if options.design:
            # Just check the format
            adapter_finder = FindAdaptersFromDesign(options.design,
                                                    options.adapters)

    # If all options are valid, we can now create the tree structure
    sequana_init(options)
def __init__(self, filename_fastq, fof_databases, threads=1,
             output_directory="./kraken_hierarchical/",
             keep_temp_files=False, force=False):
    """.. rubric:: **constructor**

    :param filename_fastq: FastQ file to analyse (a filename, or a list of
        one or two filenames)
    :param fof_databases: file that contains a list of databases paths
        (one per line). The order is important. Note that you may also
        provide a list of database paths.
    :param threads: number of threads to be used by Kraken
    :param output_directory: name of the output directory
    :param keep_temp_files: bool, if True, will keep intermediate files
        from each Kraken analysis, and save html report at each step
    :param bool force: if the output directory already exists, the
        instanciation fails so that the existing data is not overrwritten.
        If you wish to overwrite the existing directory, set this
        parameter to True.
    :raises TypeError: if the databases or the input files are not provided
        in one of the supported forms.
    :raises FileExistsError: if the output directory already exists and
        *force* is False.
    :raises OSError: if the output directory cannot be created for any
        other reason.
    """
    # When running kraken in paired mode and saving the unclassified reads
    # in a file, the output file (fastq) contains both R1 and R2 so there
    # are concatenated in the same file. Actually, if there is R1 and R2,
    # there are concatenated as R1 N R2 (with the letter N as link).
    # So, in the hiearchical search, paired case, the first iteration has 2
    # input files, must subsequent iterations will have only one file as
    # input, that is the output of the previous run (provided by
    # --unclassified-out option)
    self.filename_fastq = filename_fastq

    # input databases may be stored in a file
    if isinstance(fof_databases, str) and os.path.exists(fof_databases):
        with open(fof_databases, 'r') as fof:
            # surrounding whitespace is dropped and blank lines are skipped
            # so that they do not end up as empty database paths
            self.databases = [
                line.strip() for line in fof if line.strip()
            ]
    # or simply provided as a list
    elif isinstance(fof_databases, list):
        self.databases = fof_databases[:]
    else:
        raise TypeError("input databases must be a list of valid kraken "
                        "databases or a file (see documebntation)")

    self.threads = threads
    self.output_directory = output_directory
    self.keep_temp_files = keep_temp_files

    # check if the output directory already exist
    try:
        os.mkdir(output_directory)
    except OSError as err:
        if not os.path.isdir(output_directory):
            # not an "already exists" situation (missing parent, lack of
            # permission, a regular file in the way): do not hide it
            raise
        if force is False:
            msg = 'Output directory %s already exists' % output_directory
            logger.error(msg)
            raise FileExistsError(msg) from err
        elif force is True:
            logger.warning("Output directory %s already exists. You may "
                           "overwrite existing results" % output_directory)

    # list of input fastq files
    if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
        self.inputs = filename_fastq[:]
    elif isinstance(filename_fastq, str):
        self.inputs = [filename_fastq]
    else:
        msg = "input file must be a string or list of 2 filenames"
        msg += "\nYou provided {}".format(filename_fastq)
        raise TypeError(msg)