Esempio n. 1
0
    def copy_file(self, filename, target_dir):
        """Copy a file into a sub-directory of the report directory.

        :param str filename: file to copy.
        :param str target_dir: directory, relative to ``config.output_dir``,
            where the file is copied. It is created when missing.
        :return: ``target_dir`` joined with the base name of ``filename``,
            i.e. the location of the copy relative to ``config.output_dir``.
        :raises FileExistsError: if the target path exists but is not a
            directory.
        :raises FileNotFoundError: if ``shutil.copy`` cannot find the file.
        """
        directory = config.output_dir + os.sep + target_dir
        try:
            os.makedirs(directory)
        except FileExistsError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(directory):
                msg = "{0} exist and it is not a directory".format(directory)
                logger.error(msg)
                raise FileExistsError(msg)
        try:
            shutil.copy(filename, directory)
        except FileNotFoundError:
            # Log and propagate the explanation instead of an empty exception.
            msg = "{0} doesn't exist".format(filename)
            logger.error(msg)
            raise FileNotFoundError(msg)
        return target_dir + os.sep + os.path.basename(filename)
Esempio n. 2
0
    def __init__(self, cutadapt_log, sample_name, output_filename=None):
        """Parse a trimming log and build the corresponding HTML report.

        :param str cutadapt_log: path to the cutadapt log file. When the path
            obtained by replacing ``.txt`` with ``.json`` exists, that file is
            parsed with :meth:`parse_atropos` instead.
        :param sample_name: stored as :attr:`sample_name`.
        :param output_filename: forwarded to :meth:`create_html`.
        """
        super().__init__()

        # A missing log file is only reported; construction carries on.
        if not os.path.exists(cutadapt_log):
            logger.error("This file {} does not exist".format(cutadapt_log))

        self.input_filename = cutadapt_log
        self.sample_name = sample_name
        self.jinja = {}
        self.data = {}

        json_log = cutadapt_log.replace(".txt", ".json")
        use_atropos = os.path.exists(json_log)

        # The mode is set before read_data() is called, as in both branches
        # of the historical code.
        self.input_mode = "atropos" if use_atropos else "cutadapt"
        self.read_data()  # store the rawdata
        if use_atropos:
            self.parse_atropos(json_log)
        else:
            self.parse_cutadapt()
            self._data_histograms = self._get_histogram_data()

        self.create_report_content()
        self.create_html(output_filename)
Esempio n. 3
0
    def __init__(self, filename, **kwargs):
        """.. rubric:: constructor

        Open *filename*, initialise the underlying ``vcf.Reader`` with it and
        set the default (all zero) filter parameters.

        :param str filename: a vcf file.
        :param kwargs: any arguments accepted by vcf.Reader
        :raises FileNotFoundError: if the file cannot be found; the error is
            logged and the original exception is propagated.
        """
        try:
            self.filename = filename
            filin = open(filename, "r")
            vcf.Reader.__init__(self, fsock=filin, **kwargs)
            self._get_start_index()
        except FileNotFoundError as e:
            logger.error(
                "FileNotFoundError({0}): {1}".format(e.errno, e.strerror)
            )
            # Re-raise the caught exception itself so that errno, strerror
            # and filename stay available to the caller.
            raise
        # initiate filters dictionary
        self._filters_params = {
            'freebayes_score': 0,
            'frequency': 0,
            'min_depth': 0,
            'forward_depth': 0,
            'reverse_depth': 0,
            'strand_ratio': 0,
        }
        self._is_joint = self._check_if_joint()
Esempio n. 4
0
    def __init__(self, reference, log=None):
        """.. rubric:: Constructor

        :param reference: annotation reference file. The program exits with
            status 1 when it is not an existing file.
        :param log: log file. An existing file at that path is removed.
        """
        # Guard clause: nothing can be done without the input file
        if not os.path.isfile(reference):
            logger.error("FileNotFoundError: The file " + reference +
                         " does not exist")
            sys.exit(1)

        self.reference = reference
        # reference name: base name of the file up to its first dot
        self.ref_name = os.path.basename(reference).split('.')[0]

        # Start from a fresh log file
        self.log_file = log
        if log is not None and os.path.isfile(log):
            os.remove(log)

        # A snpEff.config file is expected in the current directory
        if not os.path.exists("snpEff.config"):
            self._get_snpeff_config()

        # Build the custom database, or only register it in the config
        predictor = os.sep.join(
            ["data", self.ref_name, "snpEffectPredictor.bin"])
        if not os.path.exists(predictor):
            self._add_custom_db()
        elif not self._check_database(self.ref_name):
            self._add_db_in_config()
Esempio n. 5
0
    def __init__(self,
                 annotation,
                 log=None,
                 snpeff_datadir="data",
                 fastafile=None):
        """.. rubric:: Constructor

        :param annotation: annotation reference file. Its name must end with
            ``.genbank``/``.gbk`` or ``.gff3``/``.gff``; the program exits
            with status 1 otherwise, or when the file does not exist.
        :param log: log file. An existing file at that path is removed.
        :param snpeff_datadir: directory searched for the
            ``<ref_name>/snpEffectPredictor.bin`` database file.
        :param fastafile: if a GFF is used, you must provide the FASTA input
            file as well
        """
        # Guard clause: nothing can be done without the input file
        if not os.path.isfile(annotation):
            logger.error("FileNotFoundError: The file " + annotation +
                         " does not exist")
            sys.exit(1)

        self.annotation = annotation
        self.fastafile = fastafile
        # reference name: base name of the file up to its first dot
        self.ref_name = os.path.basename(annotation).split('.')[0]

        # The annotation format is deduced from the file extension
        if self.annotation.endswith((".genbank", ".gbk")):
            self.format = "gbk"
        elif self.annotation.endswith((".gff3", ".gff")):
            self.format = "gff3"
        else:
            logger.error("Format must be genbank or gff3")
            sys.exit(1)

        # Keep data directory where everything will be saved
        self.snpeff_datadir = snpeff_datadir

        # Start from a fresh log file
        self.log_file = log
        if log is not None and os.path.isfile(log):
            os.remove(log)

        # A snpEff.config file is expected in the current directory
        if os.path.exists("snpEff.config"):
            logger.info("snpEff.config file exists already. Using it.")
        else:
            logger.info("snpEff.config file not found, creating one")
            self._get_snpeff_config()

        # Build the custom database, or only register it in the config
        predictor = os.sep.join(
            [self.snpeff_datadir, self.ref_name, "snpEffectPredictor.bin"])
        if not os.path.exists(predictor):
            self._add_custom_db()
        elif not self._check_database(self.ref_name):
            self._add_db_in_config()
        else:
            logger.info("DB already added in your config and database")
Esempio n. 6
0
 def check(self):
     """Check that at least one sample can be associated with adapters.

     :meth:`get_adapters_from_sample` is called for every name in
     :attr:`sample_names`; each failure is logged as an error.

     :raises ValueError: if the lookup succeeded for none of the samples
         (including when there is no sample at all).
     """
     found = 0
     for sample in self.sample_names:
         try:
             self.get_adapters_from_sample(sample)
         except Exception:
             # Only ordinary errors count as "no index found";
             # KeyboardInterrupt and SystemExit are left to propagate.
             logger.error("No index found for sample %s" % sample)
         else:
             found += 1
     if found == 0:
         raise ValueError("None of the sample match any of the adapters")
Esempio n. 7
0
 def _get_package_location(self):
     """Return the ``data`` directory of the installed pipeline package.

     The distribution ``sequana_<name>`` is looked up with
     ``pkg_resources``; the returned path is
     ``<location>/sequana_pipelines/<name>/data``.

     :raises pkg_resources.DistributionNotFound: if the distribution is not
         installed (the error is logged before being re-raised).
     """
     # Imported before the ``try`` so that the name is bound when the
     # ``except`` clause is evaluated.
     import pkg_resources

     fullname = "sequana_{}".format(self.name)
     try:
         info = pkg_resources.get_distribution(fullname)
     except pkg_resources.DistributionNotFound:
         logger.error("package provided (%s) not installed." % fullname)
         raise
     return os.sep.join(
         [info.location, "sequana_pipelines", self.name, 'data'])
Esempio n. 8
0
    def get_roi(self):
        """Keep positions with zscore outside of the thresholds range.

        Rows of :attr:`df` whose ``zscore`` is above ``thresholds.high2`` or
        below ``thresholds.low2`` are selected. When the feature dictionary
        of :attr:`bed` has an entry for the chromosome (or for the accession
        guessed from its name), these features are passed along.

        :return: a dataframe from :class:`FilteredGenomeCov`

        .. note:: depends on the :attr:`thresholds` low and high values.
        """
        features = self.bed.feature_dict
        try:
            # NOTE(review): the query string refers to these two locals with
            # the ``@name`` syntax (presumably pandas' DataFrame.query), so
            # they must keep these names and the query must be evaluated from
            # this function.
            second_high = self.thresholds.high2
            second_low = self.thresholds.low2
            query = "zscore > @second_high or zscore < @second_low"

            # in the genbank, the names appears as e.g. JB12345
            # but in the fasta or BED files, it may be something like
            # gi|269939526|emb|FN433596.1|
            # so they do not match. We can try to guess it
            alternative = None

            if features and self.chrom_name not in features:
                # The chromosome name is inserted right away so that the
                # text appended below is never %-interpolated again.
                msg = """Chromosome name (%s) not found
                        in the genbank. Make sure the chromosome names in
                        the BAM/BED files are compatible with the genbank
                        content. Genbank files contains the following keys """ % self.chrom_name
                for this in features:
                    msg += "\n                        - %s" % this

                fields = [x for x in self.chrom_name.split("|") if x]
                if fields:
                    # assume the accession is last, and remove its version
                    alternative = fields[-1].split('.')[0]
                if alternative in features:
                    msg += "\n Guessed the chromosome name to be: %s" % alternative
                else:
                    features = None
                logger.warning(msg)

            selection = self.df.query(query)
            if not features:
                return FilteredGenomeCov(selection, self.thresholds)
            key = alternative if alternative else self.chrom_name
            return FilteredGenomeCov(selection, self.thresholds,
                                     features[key])
        except KeyError:
            # The docstring is passed as a logging argument, hence the
            # explicit %s placeholder.
            logger.error("Column zscore is missing in data frame.\n"
                         "You must run compute_zscore before get low coverage."
                         "\n\n%s", self.__doc__)
            sys.exit(1)
Esempio n. 9
0
def get_sequana_adapters(type_, direction):
    """Return path to a list of adapters in FASTA format

    The file ``adapters_<type_>_<direction>.fa`` is searched first (a
    warning asks to rename it); ``<type_>_<direction>.fa`` is used otherwise.

    :param type_: adapter type; must be one of the values returned by
        ``_get_registered_adapters()``
    :param direction: fwd, rev, revcomp
    :return: path to the adapter filename
    :raises ValueError: if ``type_`` or ``direction`` is not valid.
    """
    # search possible types
    registered = _get_registered_adapters()
    if type_ not in registered:
        logger.error("This adapter type (%s) is not valid" % type_)
        logger.error("choose one in %s types" % registered)
        raise ValueError("invalid adapter type: {}".format(type_))

    directions = ["fwd", "rev", "revcomp"]
    if direction not in directions:
        logger.error("This kind of tag (%s) is not valid" % direction)
        logger.error("choose one in %s " % directions)
        raise ValueError("invalid direction: {}".format(direction))

    try:
        this = sequana_data("adapters_%s_%s.fa" % (type_, direction))
    except Exception:
        # Legacy name not available: fall back on the current naming scheme.
        return sequana_data("%s_%s.fa" % (type_, direction))
    logger.warning("Rename {} (remove the adapters_ prefix)".format(this))
    return this
Esempio n. 10
0
def main(args=None):
    """Command-line entry point: prepare the pipeline working directory.

    :param args: command-line arguments, program name included. Defaults to
        ``sys.argv``.

    The program exits with status 1 when fewer than two flowcell
    directories are provided.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here.
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.level = options.level

    if options.from_project is None:
        # fill the config file with input parameters
        cfg = manager.config.config
        # There is no need for input pattern / parameters in this pipeline, just
        # the input path where fastq files are to be found.
        cfg.input_pattern = options.input_pattern
        cfg.flowcell_paths = [
            os.path.abspath(x) for x in options.flowcell_paths
        ]

        # Merging needs two directories or more: an empty list is as
        # invalid as a single entry.
        if len(cfg.flowcell_paths) < 2:
            logger.error(
                "To merge flowcells, you must provide at least two directories"
            )
            sys.exit(1)

        for path in cfg.flowcell_paths:
            manager.exists(path)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    # No need to check for input files since the
    # input_directory / read_tag is not used in this pipeline
    manager.teardown(check_input_files=False)
Esempio n. 11
0
 def __init__(self, input_filename, **kwargs):
     """
     :param str input_filename: a bcf file.
     :param kwargs: any arguments accepted by VariantFile.
     :raises OSError: if the parent constructor fails with an OSError; the
         error is logged and the original exception is propagated.
     """
     try:
         super().__init__(input_filename, **kwargs)
     except OSError:
         logger.error("OSError: {0} doesn't exist.".format(input_filename))
         # Re-raise the caught exception itself to keep its details
         # (errno, message) instead of an empty OSError.
         raise
     # initiate filters dictionary
     self._filters = {'freebayes_score': 0,
                      'frequency': 0,
                      'min_depth': 0,
                      'forward_depth': 0,
                      'reverse_depth': 0,
                      'strand_ratio': 0}
Esempio n. 12
0
    def __init__(self, filename, verbose=True, **kwargs):
        """.. rubric:: constructor

        :param str filename: a vcf file.
        :param bool verbose: if true, print the VCF version once the file
            is loaded.
        :param kwargs: any arguments accepted by vcf.Reader
        :raises FileNotFoundError: if the file cannot be found; the error is
            logged and the original exception is propagated.
        """
        try:
            self.filename = filename
            filin = open(filename, "r")
            vcf.Reader.__init__(self, fsock=filin, **kwargs)
            self._get_start_index()
        except FileNotFoundError as e:
            logger.error("FileNotFoundError({0}): {1}".format(
                e.errno, e.strerror))
            # Re-raise the caught exception itself so that errno, strerror
            # and filename stay available to the caller.
            raise

        if verbose:
            print("Found VCF version {}".format(self.version))
Esempio n. 13
0
    def _add_custom_db(self):
        """ Add your custom file in the local snpEff database.

        The annotation (and, for GFF3, the FASTA file) is copied into
        ``data/<ref_name>/``, the database is registered through
        :meth:`_add_db_in_config` and ``snpEff build`` is executed. Its
        output is appended to :attr:`log_file` when one is set.

        The program exits with status 1 if the annotation format is not
        supported or the FASTA file required by GFF3 is missing, and with
        the return code of ``snpEff build`` if the build fails.
        """
        logger.info("adding custom DB using your input file(s)")
        logger.info(" - {}".format(self.annotation))
        if self.fastafile:
            logger.info(" - {}".format(self.fastafile))

        # Validate the inputs first so that a failure leaves neither the
        # configuration nor the data directory half updated.
        if self.format == "gbk":
            build_option, genes_file = "-genbank", "genes.gbk"
        elif self.format == "gff3":
            build_option, genes_file = "-gff3", "genes.gff"
            if self.fastafile is None or not os.path.exists(self.fastafile):
                logger.error("Input file {} does not exist".format(
                    self.fastafile))
                sys.exit(1)
        else:
            logger.error("Format must be genbank or gff3")
            sys.exit(1)

        # create directory and copy annotation file
        # NOTE(review): the directory name "data" is hard-coded here;
        # confirm it matches the data directory snpEff is configured with.
        genome_dir = "data" + os.sep + self.ref_name + os.sep
        try:
            os.makedirs(genome_dir)
        except FileExistsError:
            pass

        # add new annotation file in config file
        self._add_db_in_config()

        shutil.copyfile(self.annotation, genome_dir + genes_file)
        if self.format == "gff3":
            shutil.copyfile(self.fastafile, genome_dir + "sequences.fa")
        snpeff_build_line = ["snpEff", "build", build_option, '-v',
                             self.ref_name]

        if self.log_file:
            # Keep the log file open until the build has finished.
            with open(self.log_file, "ab") as fl:
                rc = sp.Popen(snpeff_build_line, stderr=fl, stdout=fl).wait()
        else:
            rc = sp.Popen(snpeff_build_line).wait()
        if rc != 0:
            logger.error("snpEff build return a non-zero code")
            sys.exit(rc)
Esempio n. 14
0
    def copy_requirements(self):
        """Copy the files listed as requirements into the working directory.

        Nothing is done when the configuration has no ``requirements``
        entry. A requirement that exists as a local path is copied as is;
        otherwise, unless it starts with ``http``, it is searched in
        :attr:`datapath`. The program exits with status 1 when such a file
        cannot be copied.
        """
        # FIXME
        # code redundant with snaketools.config.copy_requirements
        if 'requirements' not in self.config.config:
            return

        for requirement in self.config.config.requirements:
            if os.path.exists(requirement):
                try:
                    shutil.copy(requirement, self.workdir)
                except shutil.SameFileError:
                    pass  # the target and input may be the same
            elif not requirement.startswith('http'):
                logger.info('Copying {} from sequana pipeline {}'.format(
                    requirement, self.name))
                path = self.datapath + os.sep + requirement
                try:
                    shutil.copy(path, self.workdir)
                except Exception as err:
                    logger.error(err)
                    msg = "This requirement %s was not found in sequana."
                    logger.error(msg % requirement)
                    sys.exit(1)
Esempio n. 15
0
    def check_input_files(self, stop_on_error=True):
        # Sanity checks
        cfg = self.config.config
        filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern)
        logger.info("Found {} files matching your input  pattern ({})".format(
            len(filenames), cfg.input_pattern))

        if len(filenames) == 0:
            logger.critical(
                "Found no files with your matching pattern ({})".format(
                    cfg.input_pattern))
            if "*" not in cfg.input_pattern and "?" not in cfg.input_pattern:
                logger.critical(
                    "No wildcard used in your input pattern, please use a * or ? character"
                )
            if stop_on_error:
                sys.exit(1)

        from sequana import FastQFactory
        try:
            ff = FastQFactory(cfg.input_directory + os.sep + cfg.input_pattern,
                              read_tag=cfg.input_readtag)

            # This tells whether the data is paired or not
            if ff.paired:
                paired = "paired reads"
            else:
                paired = "single-end reads"
            logger.info(
                "Your input data seems to be made of {}".format(paired))

        except:
            logger.error(
                """Input data is not fastq-compatible with sequana pipelines. You may want to set the read_tag to empty string or None if you wish
to analyse non-fastQ files (e.g. BAM)""")
            sys.exit(1)
Esempio n. 16
0
def main(args=None):
    """Entry point of the coverage command-line tool.

    Depending on the options, this downloads a reference and/or genbank
    file and returns, or analyses the coverage of a BAM/BED input
    chromosome by chromosome and finally runs multiqc in the output
    directory.

    :param args: command-line arguments, program name included. Defaults to
        a copy of ``sys.argv``.
    :raises ValueError: if the input file is neither a BAM nor a BED file.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    nchrom = len(gc.chrom_names)
    if nchrom == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif (options.chromosome == 0 or options.chromosome < -1
            or options.chromosome > nchrom):
        # 0 is rejected as well: user indices start at 1 and 0 would
        # silently select the last chromosome (index -1).
        msg = "invalid chromosome index; must be in [1;{}]".format(nchrom)
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome-1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            # Report the name of the chromosome actually selected
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (i + 1, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Argument list built explicitly so that a configuration path
        # containing whitespace is not split into several arguments.
        cmd = ['multiqc', '.', '-m', 'sequana_coverage', '-f', '-c',
               str(pathtocfg)]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
Esempio n. 17
0
def main(args=None):
    """Entry point of the coverage command-line tool.

    Depending on the options, this downloads a reference and/or genbank
    file and returns, or analyses the coverage of a BAM/BED input
    chromosome by chromosome and finally runs multiqc in the output
    directory.

    :param args: command-line arguments, program name included. Defaults to
        a copy of ``sys.argv``.
    :raises ValueError: if the input file is neither a BAM nor a BED file.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]

    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    nchrom = len(gc.chrom_names)
    if nchrom == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif (options.chromosome == 0 or options.chromosome < -1
            or options.chromosome > nchrom):
        # 0 is rejected as well: user indices start at 1 and 0 would
        # silently select the last chromosome (index -1).
        msg = "invalid chromosome index; must be in [1;{}]".format(nchrom)
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome-1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            end = gc.positions[this]["end"]
            start = gc.positions[this]["start"]
            data = (this, start, end, end - start)
            logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            # Report the name of the chromosome actually selected
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (i + 1, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)
            # logging level seems to be reset to warning somewhere
            logger.level = options.logging_level

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Argument list built explicitly so that a configuration path
        # containing whitespace is not split into several arguments.
        cmd = ['multiqc', '.', '-m', 'sequana_coverage', '-f', '-c',
               str(pathtocfg)]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
        #    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        #out, err = proc.communicate()
        #with open("multiqc.log", "w") as fout:
        #    fout.write(err.decode())
    logger.info("Done")
Esempio n. 18
0
    def download_fasta(self, filelist, output_dir=None, from_ena=True):
        """Download a FASTA (or list of)

        :param filelist: one of

            - a list of identifiers;
            - the name of a ``.txt`` list that does not exist locally, or a
              key of :attr:`_metadata`: the list of identifiers is then
              downloaded from http://www.ebi.ac.uk/genomes/;
            - one of the built-in sets ``macaca``, ``mus_musculus``,
              ``worms``;
            - the name of an existing local file containing
              whitespace-separated identifiers;
            - a single identifier.
        :param output_dir: directory where the FASTA files are saved. It is
            created when missing; defaults to the current directory.
        :param from_ena: not used by this method.
        :raises TypeError: if ``filelist`` is neither a list nor a string.

        The identifiers are stored in :attr:`_identifiers` and the output of
        :meth:`ena_id_to_gi_number` in :attr:`results`. An identifier for
        which a file ``ENA_<identifier>*`` already exists in ``output_dir``
        is not downloaded again.

        .. warning:: The filename is named after the accession without .X number
            If there are several variant .1, .2 the later will be used. This
            should not happen if the list is properly defined.
        """
        from bioservices import ENA

        # Lists are handled first: every other case relies on str methods.
        if isinstance(filelist, list):
            identifiers = filelist[:]
        elif not isinstance(filelist, str):
            raise TypeError(
                "filelist must be a list or a string; got {}".format(
                    type(filelist).__name__))
        elif filelist.endswith(".txt") and not os.path.exists(filelist):
            logger.info(
                "Downloading list from http://www.ebi.ac.uk/genomes/%s" %
                filelist)
            data = urlopen("http://www.ebi.ac.uk/genomes/%s" %
                           filelist).readlines()
            identifiers = [x.strip().decode() for x in data]
        elif filelist == "macaca":
            identifiers = [
                "CM001276", "CM001277", "CM001278", "CM001279", "CM001280",
                "CM001281", "CM001282", "CM001283", "CM001284", "CM001285",
                "CM001286", "CM001287", "CM001288", "CM001289", "CM001290",
                "CM001291", "CM001292", "CM001293", "CM001294", "CM001295",
                "CM001296"
            ]
        elif filelist == "mus_musculus":  #19 +x+y chromosomes + 5 mitochondrion
            # could also add strain C57BL.
            # Every entry ends with a comma: without it, adjacent string
            # literals are merged into a single bogus identifier. The
            # repeated CM000225/CM000226 entries are listed only once.
            identifiers = [
                "AY172335", "CM000209", "CM000210", "CM000211",
                "CM000212", "CM000213", "CM000214", "CM000215", "CM000216",
                "CM000217", "CM000218", "CM000219", "CM000220", "CM000221",
                "CM000222", "CM000223", "CM000224", "CM000225", "CM000226",
                "CM000227", "CM000228", "CM000229",
                "EF108342", "AB042432", "AY675564", "DQ874614",
            ]
        elif filelist == "worms":  # Caernorhabditis briggsae and elegans
            identifiers = [
                "AC186293", "FR847112", "FR847113", "FR847114", "FR847118",
                "FR847121", "FR847123", "BX284601", "BX284602", "BX284603",
                "BX284604", "BX284605", "BX284606"
            ]
        elif filelist in self._metadata.keys():
            name = self._metadata[filelist][0]
            logger.info(
                "Downloading list from http://www.ebi.ac.uk/genomes/%s" % name)
            data = urlopen("http://www.ebi.ac.uk/genomes/%s" %
                           name).readlines()
            identifiers = [x.strip().decode() for x in data]
        elif os.path.exists(filelist):
            # a filename (assuming a single column)
            with open(filelist) as fin:
                identifiers = fin.read().split()
        else:
            # a single identifier
            identifiers = [filelist]
        self._identifiers = identifiers

        self.results = self.ena_id_to_gi_number(identifiers)

        # do not use caching things this could be huge data sets.
        ena = ENA()

        if not output_dir:
            output_dir = "."
        else:
            os.makedirs(output_dir, exist_ok=True)

        N = len(identifiers)
        pb = Progress(N)
        logger.info("Fetching all fasta from ENA")
        for i, identifier in enumerate(identifiers):
            filenames = glob.glob(output_dir + os.sep + "ENA_%s*" % identifier)

            if len(filenames) >= 1:
                pb.animate(i + 1)
                # no need to fetch and save the data it looks like...
                continue

            # download data from ENA
            data = ena.get_data(identifier, "fasta")

            # Split header and Fasta
            header, others = data.decode().split("\n", 1)

            # Source of failure:
            # - list and DB are not synchrone: e.g. some entries may be deleted
            if "suppressed" in header:
                continue
            if ">" not in header:
                continue

            # Do not use try/except since when it fails, this is a real issue
            name = header.strip(">").split(" ")[0]
            db, id_, acc = name.split("|")

            try:
                header = self.switch_header_to_gi(acc)
            except Exception:
                logger.error("Failed for this entry:")
                logger.error(identifier)
                logger.error(header)
                logger.error(name)
                continue

            # Save to local file
            # WARNINGS: extension is .fa because kraken-build expects .fa files
            filename = "%s_%s.fa" % (db, acc.split(".")[0])
            filename = output_dir + os.sep + filename

            with open(filename, "w") as fout:
                fout.write(header + "\n" + others)
            pb.animate(i + 1)
Esempio n. 19
0
def main(args=None):
    """Prepare (and optionally launch) a ``sequana_rnaseq`` pipeline run.

    The function parses the command line, lets ``SequanaManager`` create the
    working directory, fills the pipeline configuration from the options
    (skipped when ``options.from_project`` is set), optionally sanity-checks
    the genome GFF file, saves everything through ``manager.teardown()`` and,
    when ``options.run`` is set, starts the generated shell script.

    :param list args: command-line arguments including the program name;
        defaults to ``sys.argv``.

    The process exits with status 1 when the genome FASTA file is missing or
    when one of the GFF sanity checks fails.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline

    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger

    logger.setLevel(options.level)
    logger.name = "sequana_rnaseq"
    logger.info("#Welcome to sequana_rnaseq pipeline.")

    # Stays None when the configuration is not filled from the options, so
    # that the custom GFF copy at the end can detect that case explicitly.
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # the FASTA file must be named after the genome directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # mutually exclusive options
        if options.contaminant_file:
            cfg.general.contaminant_file = os.path.abspath(
                options.contaminant_file)
            logger.warning(
                "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored"
            )
            cfg.general.rRNA_feature = None
        else:
            cfg.general.rRNA_feature = options.rRNA_feature

        # --------------------------------------------------------- trimming
        cfg.trimming.software_choice = options.trimming_software_choice
        cfg.trimming.do = not options.disable_trimming
        # -1 means "not set by the user": a per-tool default is used below
        qual = options.trimming_quality

        if options.trimming_software_choice in ["cutadapt", "atropos"]:
            cfg.cutadapt.tool_choice = options.trimming_software_choice
            cfg.cutadapt.fwd = options.trimming_adapter_read1
            cfg.cutadapt.rev = options.trimming_adapter_read2
            cfg.cutadapt.m = options.trimming_minimum_length
            cfg.cutadapt.mode = options.trimming_cutadapt_mode
            cfg.cutadapt.options = options.trimming_cutadapt_options  # trim Ns -O 6
            cfg.cutadapt.quality = 30 if qual == -1 else qual
        else:
            cfg.fastp.minimum_length = options.trimming_minimum_length
            cfg.fastp.quality = 15 if qual == -1 else qual
            cfg.fastp.fwd = options.trimming_adapter_read1
            cfg.fastp.rev = options.trimming_adapter_read2
            cfg.fastp.options = " --cut_tail "
            cfg.fastp.disable_quality_filtering = False
            cfg.fastp.disable_adapter_trimming = False

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc

        if options.do_rnaseqc:
            if options.rnaseqc_gtf_file is None:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no GTF"
                    " file provided; Please use --rnaseqc-gtf-file option. Switching off in your"
                    " config file and continuing. You may use 'sequana gff2gtf input.gff' to create"
                    " the gtf file")
                cfg.rnaseqc.do = False
            if options.aligner in ["salmon"]:
                logger.warning(
                    "You asked for RNA_seqc QC assessements but no"
                    " BAM will be generated by the salmon aligner. Switching off this option. "
                )
                cfg.rnaseqc.do = False

        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        cfg.rseqc.do = options.do_rseqc
        cfg.rseqc.bed_file = options.rseqc_bed_file
        # -------------------------------------------------------- RNAdiff

        # NOTE(review): nothing from this import is used below; it is kept in
        # case it is needed for its import-time effects -- confirm.
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carefully the GFF; if the user knows what he is doing, no need
        # to check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "Checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3

            genome_directory = os.path.abspath(cfg.general.genome_directory)
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"

            gff = GFF3(gff_file)
            df_gff = gff.df  # this takes about one minute on eukaryotes
            valid_features = gff.features  # about 3 seconds
            valid_attributes = gff.attributes  # about 10 seconds

            # first check the rRNA feature
            if (cfg["general"]["rRNA_feature"]
                    and cfg["general"]["rRNA_feature"] not in valid_features):

                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(
                        valid_features))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "Checking your input GFF file and feature counts options.")
            logger.info(
                f"You chose '{fc_type}' feature and '{fc_attr}' attribute")
            # if only one feature (99% of the projects)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                # NOTE(review): the enclosing condition already requires that
                # the feature contains no comma, so this branch looks
                # unreachable as written -- confirm the intended behaviour.
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(",")
                gff.save_gff_filtered(features=fc_types, filename="custom.gff")
                cfg.general.custom_gff = "custom.gff"

            for fc_type in fc_types:
                S = sum(df_gff["genetic_type"] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_features))
                    sys.exit(1)
                logger.info("Found {} '{}' entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("genetic_type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                attribute_names = set(attributes)
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, attribute_names))
                    sys.exit(1)

                unique = {
                    x[fc_attr] for k, x in dd.attributes.items()
                    if fc_attr in x
                }
                logger.info(
                    "Found {} '{}' entries for the attribute [{} unique entries]"
                    .format(S, fc_attr, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in attribute_names:
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, attribute_names))
                            sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory. The custom_gff
    # entry is optional (and cfg is None when the options were not used to
    # fill the configuration), hence the explicit lookup errors.
    try:
        custom_gff = cfg.general.custom_gff
    except (AttributeError, KeyError):
        custom_gff = None

    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort: report the failure instead of hiding it
            logger.warning(
                f"Could not copy {custom_gff} to {options.workdir}: {err}")

    if options.run:
        subprocess.Popen(["sh", "{}.sh".format(NAME)], cwd=options.workdir)
Esempio n. 20
0
def main(args=None):
    """Prepare (and optionally launch) a bcl2fastq demultiplexing run.

    The function parses the command line, lets ``SequanaManager`` create the
    working directory, checks the sample sheet and BCL directory, inspects
    the run parameters file when there is one, fills the ``bcl2fastq``
    section of the configuration (skipped when ``options.from_project`` is
    set), saves everything through ``manager.teardown()`` and, when
    ``options.run`` is set, starts the generated shell script.

    :param list args: command-line arguments including the program name;
        defaults to ``sys.argv``.

    The process exits with status 1 when the sample sheet or the BCL
    directory does not exist, or when the run parameters mention "NextSeq"
    while the merging strategy is "none".
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # ============================================== sanity checks
    if not os.path.exists(options.samplesheet):
        logger.error(f"{options.samplesheet} file does not exist")
        sys.exit(1)

    if not os.path.exists(options.bcl_directory):
        logger.error(f"{options.bcl_directory} directory does not exist")
        sys.exit(1)

    # Check the sample sheet. A failure is only reported here; the function
    # deliberately carries on so that the setup can be completed.
    from sequana import iem
    try:
        samplesheet = iem.IEM(options.samplesheet)
        samplesheet.validate()
    except Exception as err:
        logger.critical(err)
        logger.critical(
            """Your sample sheet seems to be incorrect. Before running the pipeline
you will have to fix it. You may use 'sequana samplesheet --quick-fix'""")

    # The run parameters file is spelled RunParameters.xml on NextSeq and
    # runParameters.xml on HiSeq; take the first one that exists.
    runparam = None
    for candidate in ("RunParameters.xml", "runParameters.xml"):
        candidate_path = options.bcl_directory + os.sep + candidate
        if os.path.exists(candidate_path):
            runparam = candidate_path
            break
    else:
        logger.warning("RunParameters.xml or runParameters.xml file not found")

    if runparam:
        with open(runparam, "r") as fin:
            data = fin.read()

        if "NextSeq" in data and options.merging_strategy != "merge":
            if options.merging_strategy == "none_and_force":
                msg = "This is a NextSeq. You set the --merging-strategy to"
                msg += " none_and_force. So, we proceed with no merging strategy"
                logger.warning(msg)
            if options.merging_strategy == "none":
                msg = "This is a NextSeq run. You must set the "
                msg += " --merging-strategy to 'merge'."
                # fatal: reported as an error since the process stops here
                logger.error(msg)
                sys.exit(1)

    if options.from_project is None:
        cfg = manager.config.config
        cfg.general.input_directory = os.path.abspath(options.bcl_directory)
        cfg.bcl2fastq.threads = options.threads
        cfg.bcl2fastq.barcode_mismatch = options.mismatch
        cfg.bcl2fastq.samplesheet_file = os.path.abspath(options.samplesheet)

        # Unlike the check above, a validation failure is not caught here
        # and therefore propagates to the caller.
        from sequana.iem import IEM
        ss = IEM(cfg.bcl2fastq.samplesheet_file)
        ss.validate()

        # the output directory is defined by the working directory
        cfg.bcl2fastq.ignore_missing_bcls = not options.no_ignore_missing_bcls
        cfg.bcl2fastq.no_bgzf_compression = not options.bgzf_compression

        if options.merging_strategy == "merge":
            cfg.bcl2fastq.merge_all_lanes = True
        elif options.merging_strategy in ["none", "none_and_force"]:
            cfg.bcl2fastq.merge_all_lanes = False

        if options.mars_seq:
            cfg.bcl2fastq.options = " --minimum-trimmed-read-length 15 --mask-short-adapter-reads 15 "
            if options.merging_strategy in ["merge"]:
                logger.warning(
                    "with --mars-seq option, the merging strategy should be none_and_force"
                )
                cfg.bcl2fastq.merge_all_lanes = False

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown(check_input_files=False)

    if options.run:
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
Esempio n. 21
0
 def feature_dict(self, anything):
     """Refuse any attempt to set ``feature_dict``.

     An error is logged and the process exits with status 1.

     :param anything: the value being assigned; it is ignored.
     """
     logger.error("AttributeError: You can't set attribute.\n"
                  "GenomeCov.feature_dict is set when "
                  "GenomeCov.genbank_filename is set.")
     sys.exit(1)
Esempio n. 22
0
 def bed(self):
     """Log an error stating that ``ChromosomeCov.bed`` cannot be set."""
     message = ("AttributeError: You can't set the ChromosomeCov.bed. "
                "Setting is done automatically when the class is "
                "created.")
     logger.error(message)
Esempio n. 23
0
    def plot(self,
             kind="pie",
             cmap="tab20c",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :param str kind: either 'pie' or 'barh'.
        :param str cmap: colormap passed to the pie plot.
        :param threshold: percentage (strictly between 0 and 100) below
            which taxons are gathered into a single 'others' entry.
        :param float radius: radius passed to the pie plot.
        :param str textcolor: color of the text labels in the pie plot.
        :param kargs: any other argument is passed to the plotting call.
        :return: None if no taxon were found (or if *kind* is not supported)
            and the plotted data (in percentage) otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.


        .. todo:: For a future release, we could use this kind of plot 
            https://stackoverflow.com/questions/57720935/how-to-use-correct-cmap-colors-in-nested-pie-chart-in-matplotlib
        """
        if len(self._df) == 0:
            return None

        # The explicit comparison is kept on purpose: only a false flag (not
        # e.g. None) triggers the conversion.
        if self._data_created == False:  # noqa: E712
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return None

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_db(list(self.taxons.index))

        # we add the unclassified only if needed
        if self.unclassified > 0:
            df.loc[-1] = ["Unclassified"] * 8

        data = self.taxons.copy()

        # we add the unclassified only if needed
        if self.unclassified > 0:
            data.loc[-1] = self.unclassified

        # convert the counts into percentages
        data = data / data.sum() * 100
        assert 0 < threshold < 100

        # everything below the threshold is gathered together and summarised
        # into 'others'
        others = data[data < threshold].sum()

        data = data[data >= threshold]
        names = df.loc[data.index]['name']

        # label the entries with the taxon names
        data.index = names.values

        if others > 0:
            data.loc['others'] = others

        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # fallback for objects that do not provide sort_values
            data.sort(inplace=True)

        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
        else:  # kind == "barh", validated above
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        # keep a handle on the axes whatever the kind of plot
        self.ax = ax

        return data
Esempio n. 24
0
    def plot(self,
             kind="pie",
             cmap="copper",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :param str kind: either 'pie' or 'barh'.
        :param str cmap: colormap passed to the pie plot.
        :param threshold: percentage (strictly between 0 and 100) below
            which taxons are gathered into a single 'others' entry.
        :param float radius: radius passed to the pie plot.
        :param str textcolor: color of the text labels in the pie plot.
        :param kargs: any other argument is passed to the plotting call.
        :return: None if no taxon were found (or if *kind* is not supported)
            and the plotted data (in percentage) otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return None

        # The explicit comparison is kept on purpose: only a false flag (not
        # e.g. None) triggers the conversion.
        if self._data_created == False:  # noqa: E712
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return None

        if len(self.taxons.index) == 0:
            return None

        # Label-based .loc replaces the .ix indexer, which pandas removed.
        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.loc[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        # convert the counts into percentages
        data = data / data.sum() * 100
        assert 0 < threshold < 100

        # Everything strictly below the threshold is summarised into
        # 'others'; entries exactly at the threshold are kept (>=) so that
        # they are not lost from both groups.
        others = data[data < threshold].sum()
        data = data[data >= threshold]
        names = df.loc[data.index]['name']

        # label the entries with the taxon names
        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # fallback for objects that do not provide sort_values
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
        else:  # kind == "barh", validated above
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        # keep a handle on the axes whatever the kind of plot
        self.ax = ax

        return data
Esempio n. 25
0
def main(args=None):
    """Prepare (and optionally launch) an RNA-seq pipeline run.

    The function parses the command line, lets ``SequanaManager`` create the
    working directory, fills the pipeline configuration from the options
    (skipped when ``options.from_project`` is set), detects whether the
    aligner index already exists in the genome directory, optionally
    sanity-checks the genome GFF file, saves everything through
    ``manager.teardown()`` and, when ``options.run`` is set, starts the
    generated shell script.

    :param list args: command-line arguments including the program name;
        defaults to ``sys.argv``.

    The process exits with status 1 when the genome FASTA file is missing,
    when both an rRNA feature and a contaminant file are given, or when one
    of the GFF sanity checks fails.
    """
    if args is None:
        args = sys.argv

    # whatever needs to be called by all pipeline before the options parsing
    from sequana_pipetools.options import before_pipeline
    before_pipeline(NAME)

    # option parsing including common epilog
    options = Options(NAME, epilog=sequana_epilog).parse_args(args[1:])

    from sequana.pipelines_common import SequanaManager

    # the real stuff is here
    manager = SequanaManager(options, NAME)

    # create the beginning of the command and the working directory
    manager.setup()
    from sequana import logger
    logger.setLevel(options.level)

    # Stays None when the configuration is not filled from the options, so
    # that the custom GFF copy at the end can detect that case explicitly.
    cfg = None

    # fill the config file with input parameters
    if options.from_project is None:
        cfg = manager.config.config

        # --------------------------------------------------------- general
        cfg.general.genome_directory = os.path.abspath(
            options.genome_directory)
        cfg.general.aligner = options.aligner

        # the FASTA file must be named after the genome directory
        genome_name = cfg.general.genome_directory.rsplit("/", 1)[1]
        prefix = cfg.general.genome_directory
        fasta = cfg.general.genome_directory + f"/{genome_name}.fa"
        if not os.path.exists(fasta):
            logger.critical(
                """Could not find {}. You must have the genome sequence in fasta with the extension .fa named after the genome directory."""
                .format(fasta))
            sys.exit(1)

        # Do we need the indexing ? For each supported aligner: the name used
        # in the log messages and the file (relative to the genome directory)
        # whose presence means that the index was already built.
        index_markers = {
            "bowtie2": ("bowtie2", f"/bowtie2/{genome_name}.rev.1.bt2"),
            "star": ("STAR", "/star/SAindex"),
            "bowtie1": ("bowtie1", f"/bowtie1/{genome_name}.rev.1.ebwt"),
            "salmon": ("salmon", "/salmon/salmon.done"),
        }
        if options.aligner in index_markers:
            label, marker = index_markers[options.aligner]
            if os.path.exists(prefix + marker):
                logger.info("Indexing found for {}.".format(label))
                cfg.general.indexing = False
            else:
                logger.info(
                    "Indexing not found for {}. Planned to be run".format(
                        label))
                cfg.general.indexing = True

        cfg.general.force_indexing = options.force_indexing
        cfg.general.rRNA_feature = options.rRNA_feature
        cfg.general.contaminant_file = options.contaminant_file

        # NOTE(review): the message says that --rRNA-feature will be ignored
        # but the process stops right after; behaviour kept as is -- confirm
        # which of the two is intended.
        if options.rRNA_feature and options.contaminant_file:
            logger.warning(
                "You are using --contaminant_file so --rRNA-feature will be ignored (we search for contaminant in the input file; not rRNA in the gff file"
            )
            sys.exit(1)

        # --------------------------------------------------------- cutadapt
        cfg.cutadapt.do = not options.skip_cutadapt
        manager.update_config(cfg, options, "cutadapt")

        # ----------------------------------------------------  others
        cfg.input_directory = os.path.abspath(options.input_directory)
        cfg.input_pattern = options.input_pattern
        cfg.input_readtag = options.input_readtag

        # ----------------------------------------------------- feature counts
        cfg.feature_counts.options = options.feature_counts_options
        cfg.feature_counts.strandness = options.feature_counts_strandness
        cfg.feature_counts.attribute = options.feature_counts_attribute
        cfg.feature_counts.feature = options.feature_counts_feature_type
        cfg.feature_counts.extra_attributes = options.feature_counts_extra_attributes

        # ------------------------------------------------------ optional
        cfg.igvtools.do = options.do_igvtools
        cfg.coverage.do = options.do_bam_coverage
        cfg.mark_duplicates.do = False
        if options.do_mark_duplicates:
            cfg.mark_duplicates.do = True

        # -------------------------------------------------------- RNAseqQC
        cfg.rnaseqc.do = options.do_rnaseqc
        cfg.rnaseqc.gtf_file = options.rnaseqc_gtf_file

        # -------------------------------------------------------- RNAdiff
        cfg.rnadiff.mode = options.rnadiff_mode

        # NOTE(review): nothing from this import is used below; it is kept in
        # case it is needed for its import-time effects -- confirm.
        import sequana_pipelines.rnaseq

        # SANITY CHECKS
        # -------------------------------------- do we find rRNA feature in the GFF ?
        # if we do not build a custom feature_counts set of options, no need to
        # check carefully the GFF; if the user knows what he is doing, no need
        # to check the GFF either
        if options.skip_gff_check is False and "," not in cfg.feature_counts.feature:
            logger.info(
                "checking your input GFF file and rRNA feature if provided")

            from sequana.gff3 import GFF3
            genome_directory = os.path.abspath(
                cfg["general"]["genome_directory"])
            genome_name = genome_directory.rsplit("/", 1)[1]
            prefix_name = genome_directory + "/" + genome_name
            gff_file = prefix_name + ".gff"
            gff = GFF3(gff_file)
            df_gff = gff.get_df()
            valid_types = gff.get_types()

            # first check the rRNA feature
            if cfg['general']["rRNA_feature"] and \
                cfg['general']["rRNA_feature"] not in valid_types:

                logger.error(
                    "rRNA feature not found in the input GFF ({})".format(
                        gff_file) +
                    " This is probably an error. Please check the GFF content and /or"
                    " change the feature name with --rRNA-feature based on the content"
                    " of your GFF. Valid features are: {}".format(valid_types))
                sys.exit(1)

            # then, check the main feature
            fc_type = cfg.feature_counts.feature
            fc_attr = cfg.feature_counts.attribute

            logger.info(
                "checking your input GFF file and feature counts options")
            # if only one feature (99% of the projects)
            if "," not in fc_type:
                fc_types = [fc_type]
            else:
                # NOTE(review): the enclosing condition already requires that
                # the feature contains no comma, so this branch looks
                # unreachable as written -- confirm the intended behaviour.
                logger.info(
                    "Building a custom GFF file (custom.gff) using Sequana. Please wait"
                )
                fc_types = fc_type.split(',')
                gff.save_gff_filtered(features=fc_types, filename='custom.gff')
                cfg.general.custom_gff = 'custom.gff'

            for fc_type in fc_types:
                S = sum(df_gff['type'] == fc_type)
                if S == 0:
                    logger.error(
                        "Found 0 entries for feature '{}'. Please choose a valid feature from: {}"
                        .format(fc_type, valid_types))
                    sys.exit(1)
                logger.info("Found {} {} entries".format(S, fc_type))

                # now we check the attribute:
                dd = df_gff.query("type==@fc_type")
                attributes = [y for x in dd.attributes for y in x.keys()]
                attribute_names = set(attributes)
                S = attributes.count(fc_attr)
                if S == 0:
                    logger.error(
                        "Found 0 entries for attribute '{}'. Please choose a valid attribute from: {}"
                        .format(fc_attr, attribute_names))
                    sys.exit(1)

                unique = {
                    x[fc_attr] for k, x in dd.attributes.items()
                    if fc_attr in x
                }
                # the feature type comes first, the attribute name is quoted
                logger.info(
                    "Found {} {} entries for attribute '{}' [{} unique entries]"
                    .format(S, fc_type, fc_attr, len(unique)))

                if S != len(unique):
                    logger.warning(
                        "Attribute non-unique. Feature counts should handle it"
                    )

                if options.feature_counts_extra_attributes:
                    for extra_attr in cfg.feature_counts.extra_attributes.split(
                            ","):
                        if extra_attr not in attribute_names:
                            logger.error(
                                "{} not found in the GFF attributes. Try one of {}"
                                .format(extra_attr, attribute_names))
                            sys.exit(1)

    # finalise the command and save it; copy the snakemake. update the config
    # file and save it.
    manager.teardown()

    # need to move the custom file into the working directory. The custom_gff
    # entry is optional (and cfg is None when the options were not used to
    # fill the configuration), hence the explicit lookup errors.
    try:
        custom_gff = cfg.general.custom_gff
    except (AttributeError, KeyError):
        custom_gff = None

    if custom_gff:
        try:
            shutil.copy(custom_gff, options.workdir)
        except OSError as err:
            # best effort: report the failure instead of hiding it
            logger.warning(
                f"Could not copy {custom_gff} to {options.workdir}: {err}")

    if options.run:
        subprocess.Popen(["sh", '{}.sh'.format(NAME)], cwd=options.workdir)
Esempio n. 26
0
    def _extract_head_gz(self, N, output_filename="test.fastq.gz", level=6, CHUNKSIZE=65536):
        """Write the first *N* lines of the gzipped input into *output_filename*.

        The file ``self.filename`` is read and decompressed chunk by chunk;
        reading stops as soon as *N* lines have been written.

        :param N: number of lines to extract (converted to an integer).
        :param str output_filename: name of the output file; it is passed
            through ``self._istozip`` to decide whether the result must be
            compressed with ``self._gzip`` afterwards.
        :param int level: not used by this method.
        :param int CHUNKSIZE: number of compressed bytes read at a time.
        :return: the number of lines written (at most *N*).
        :raises ValueError: if a chunk decompresses to nothing.

        Timings reported by the original author (Python 3.5, Linux box):

        If input is compressed:

            if output not compressed, this is 20% faster than
            "zcat file | head -1000000 > output.fastq

            If output is compressed, this is 3-4 times faster than :
            "zcat file | head -1000000 | gzip > output.fastq

        If input is not compressed:

            if output not compressed, this is 10 times slower than
            "head -1000000 > output.fastq

            If output is compressed, this is 3-4 times faster than :
            "head -1000000 | gzip > output.fastq
        """
        # make sure N is integer
        N = int(N)

        # 16 + MAX_WBITS: the input carries a gzip header and trailer
        decoder = zlib.decompressobj(16 + zlib.MAX_WBITS)

        # will we gzip the output file ?
        output_filename, tozip = self._istozip(output_filename)

        count = 0
        with open(self.filename, 'rb') as fin, open(output_filename, "wb") as fout:
            buf = fin.read(CHUNKSIZE)
            while buf:
                outstr = decoder.decompress(buf)
                if len(outstr) == 0:
                    msg = "Error while decompressing the zip file. may need " +\
                          "to dezip/rezip the data. known issue in extract_head"
                    logger.error(msg)
                    raise ValueError(msg)
                this_count = outstr.count(b"\n")
                # ">=" (not ">"): when a chunk ends exactly on the N-th line we
                # must stop here, otherwise the next iteration would find
                # nothing left to copy and append a spurious empty line.
                if count + this_count >= N:
                    missing = N - count
                    if missing > 0:
                        # keep only the lines still needed; no strip() here,
                        # see https://github.com/sequana/sequana/issues/536
                        lines = outstr.split(b"\n")
                        fout.write(b"\n".join(lines[:missing]) + b"\n")
                        count = N
                    break
                count += this_count
                fout.write(outstr)
                buf = fin.read(CHUNKSIZE)

        if tozip is True:
            self._gzip(output_filename)
        return count
Esempio n. 27
0
    def __init__(self,
                 filename_fastq,
                 fof_databases,
                 threads=1,
                 output_directory="./kraken_hierarchical/",
                 keep_temp_files=False,
                 force=False):
        """.. rubric:: **constructor**

        :param filename_fastq: FastQ file to analyse (a filename, or a list
            of one or two filenames)
        :param fof_databases: file that contains a list of database paths
            (one per line; empty lines are ignored). The order is important.
            Note that you may also provide a list of database paths.
        :param threads: number of threads to be used by Kraken
        :param output_directory: name of the output directory
        :param keep_temp_files: bool, if True, will keep intermediate files
            from each Kraken analysis, and save html report at each step
        :param bool force: if the output directory already exists, the
            instantiation fails so that the existing data is not overwritten.
            If you wish to overwrite the existing directory, set this
            parameter to True.
        :raises TypeError: if *fof_databases* is neither an existing file nor
            a list, or if *filename_fastq* is neither a string nor a list of
            one or two items.
        :raises OSError: if the output directory cannot be created for a
            reason other than "it already exists".
        :raises Exception: if the output directory already exists and *force*
            is not set.
        """
        # When running kraken in paired mode and saving the unclassified reads
        # in a file, the output file (fastq) contains both R1 and R2 so they
        # are concatenated in the same file. Actually, if there is R1 and R2,
        # they are concatenated as R1 N R2 (with the letter N as link).
        # So, in the hierarchical search, paired case, the first iteration has
        # 2 input files, but subsequent iterations will have only one file as
        # input, that is the output of the previous run (provided by
        # --unclassified-out option)
        self.filename_fastq = filename_fastq

        # input databases may be stored in a file
        if isinstance(fof_databases, str) and os.path.exists(fof_databases):
            with open(fof_databases, 'r') as fof:
                # one path per line; blank lines would otherwise end up as
                # empty database names
                self.databases = [line.strip() for line in fof if line.strip()]
        # or simply provided as a list
        elif isinstance(fof_databases, list):
            self.databases = fof_databases[:]
        else:
            raise TypeError("input databases must be a list of valid kraken "
                            "databases or a file (see documebntation)")
        self.threads = threads
        self.output_directory = output_directory
        self.keep_temp_files = keep_temp_files

        # check if the output directory already exist
        try:
            os.mkdir(output_directory)
        except OSError:
            if not os.path.isdir(output_directory):
                # The failure is not "directory already there" (missing
                # parent, permissions, a regular file with that name...):
                # do not hide it.
                raise
            if not force:
                msg = 'Output directory %s already exists' % output_directory
                logger.error(msg)
                raise Exception(msg)
            logger.warning("Output directory %s already exists. You may "
                           "overwrite existing results" % output_directory)

        # list of input fastq files
        if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
            self.inputs = filename_fastq[:]
        elif isinstance(filename_fastq, str):
            self.inputs = [filename_fastq]
        else:
            msg = "input file must be a string or list of 2 filenames"
            msg += "\nYou provided {}".format(filename_fastq)
            raise TypeError(msg)
Esempio n. 28
0
    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
                textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        Counts are converted to percentages (unclassified reads included);
        entries below *threshold* are summed up in a single "others" entry.

        :param str kind: "pie" or "barh"
        :param cmap: colormap given to the pie plot
        :param threshold: percentage (strictly between 0 and 100) under which
            an entry is merged into "others"
        :param radius: radius given to the pie plot
        :param textcolor: color of the labels in the pie plot
        :param kargs: forwarded to the plotting method of the data
        :return: None if there is nothing to plot (no data, no taxon, or
            unsupported *kind*); otherwise the plotted percentages, indexed
            by name and sorted.

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if not self._data_created:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_biokit(list(self.taxons.index))
        # extra row, labelled -1, standing for the unclassified reads
        # (.loc is the label-based replacement of the removed .ix indexer)
        df.loc[-1] = ["Unclassified"] * len(df.columns)
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        data = data/data.sum()*100
        assert threshold > 0 and threshold < 100
        others = data[data < threshold].sum()
        # >= so that an entry exactly at the threshold is kept instead of
        # vanishing from both the plot and "others"
        data = data[data >= threshold]
        # label-based lookup that tolerates labels missing from df
        names = df['name'].reindex(data.index)

        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # old pandas versions without sort_values
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10,8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind,  **kargs)
            pylab.xlabel(" percentage ")
            self.ax = ax

        return data
Esempio n. 29
0
def main(args=None):
    """Entry point of the sequana standalone application.

    Mostly checking the options provided by the user and then call
    :func:`sequana_init` function to create the pre-filled config file +
    snakemake + README +runme.sh in a dedicated project directory.

    :param list args: command line arguments with the program name first
        (same layout as ``sys.argv``, which is used when *args* is None).
    :raises ValueError: if a file or directory given by the user does not
        exist, or if the adapter options are inconsistent.

    Invalid combinations of options end with ``sys.exit(1)``.
    """
    import sequana
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        sa = Tools()
        sa.purple("Welcome to Sequana standalone application")
        logger.critical("You must use --pipeline <valid pipeline name>\nuse "
                        "--show-pipelines or --help for more information. ")
        return
    else:
        options = user_options.parse_args(args[1:])

    # these imports must be local. This also speed up the --help

    sa = Tools(verbose=options.verbose)
    sa.purple("Welcome to Sequana standalone application")

    # Those options are mutually exclusive. Each option is one bit of the
    # flag: issue=32, version=16, info=8, show_pipelines=4, pipeline=2,
    # get_config=1. The only accepted combination is pipeline + get_config (3)
    flag = int(
        "%s%s%s%s%s%s" %
        (int(bool(options.issue)), int(bool(options.version)),
         int(bool(options.info)), int(bool(options.show_pipelines)),
         int(bool(options.pipeline)), int(bool(options.get_config))), 2)
    if flag not in [1, 2, 4, 8, 16, 3, 32]:
        logger.critical("You must use one of --pipeline, --info, "
                        "--show-pipelines, --issue, --version, --get-config")
        sys.exit(1)

    # OPTIONS that gives info and exit
    if options.issue:
        onweb('https://github.com/sequana/sequana/issues')
        return

    if options.version:
        sa.purple("Sequana version %s" % sequana.version)
        return

    if options.show_pipelines:
        sa.purple("Valid pipeline names:")
        for this in sorted(valid_pipelines):
            m = Module(this)
            sa.green(" - " + this)
            print(textwrap(m.overview, indent=8))
        return

    if options.info:
        module = Module(options.info)
        module.onweb()
        return

    if options.pipeline:
        # check validity of the pipeline name
        if options.pipeline not in valid_pipelines:
            txt = "".join([" - %s\n" % this for this in valid_pipelines])
            logger.critical("%s not a valid pipeline name. Use of one:\n" %
                            options.pipeline + txt)
            sys.exit(1)

    # copy locally the request config file from a specific pipeline
    if flag == 3:  #--get-config and --pipeline used
        module = Module(options.pipeline)
        copy_config_from_sequana(module)
        return

    # pipeline should be defined by now. Let us start the real work here
    Module("dag").check("warning")
    Module(options.pipeline).check("warning")

    # If user provides file1 and/or file2, check the files exist
    if options.file1 and os.path.exists(options.file1) is False:
        raise ValueError("%s does not exist" % options.file1)

    if options.file2 and os.path.exists(options.file2) is False:
        raise ValueError("%s does not exist" % options.file2)

    if options.kraken and os.path.exists(options.kraken) is False:
        raise ValueError("%s does not exist" % options.kraken)

    if options.input_directory and os.path.exists(
            options.input_directory) is False:
        raise ValueError("%s does not exist" % options.input_directory)

    # check valid combo of arguments
    flag = int(
        "%s%s%s%s%s" % (
            int(bool(options.pattern)),
            int(bool(options.input_directory)),
            int(bool(options.file1)),
            int(bool(options.file2)),
            int(bool(options.config)),
        ), 2)

    # One bit per option: pattern=16, input_directory=8, file1=4, file2=2,
    # config=1
    # config file alone : 1
    # file2 alone: 2
    # file1 alone: 4
    # file1 + file2 : 4+2=6
    # input_directory alone: 8
    # pattern alone: 16
    # none of those options (0) redirect to input_directory=local
    if flag not in [0, 1, 2, 4, 6, 8, 16]:
        logger.critical(help_input + "\n\nUse --help for more information")
        sys.exit(1)

    assert options.extension in ["fastq", "fq", "fastq.gz", "fq.gz", "bam"]

    # Input description handed to the file factory below. It stays None when
    # the command line itself does not designate input files (e.g. only a
    # config file was given).
    data = None

    # Note that we use abspath to make it more robust and easier to debug
    # If no options, we use input_directory and set it to "."
    if flag == 0 or options.input_directory:
        if flag == 0:
            options.input_directory = "."
        options.input_directory = os.path.abspath(options.input_directory)
        data = options.input_directory + os.sep + "*" + options.extension
        options.file1 = ""
        options.file2 = ""
        options.pattern = ""
        if options.verbose:
            logger.info("Looking for sample files matching %s" % data)
    elif options.pattern:
        options.pattern = os.path.abspath(options.pattern)
        data = os.path.abspath(options.pattern)
        options.input_directory = ""
        options.extension = ""
        options.file1 = ""
        options.file2 = ""
    elif options.config:
        pass
    elif options.file1:
        data = [options.file1]
        options.file1 = os.path.abspath(options.file1)
        if options.file2:
            # keep file1: both files of the pair must be checked
            data.append(options.file2)
            options.file2 = os.path.abspath(options.file2)
        options.input_directory = ""
        options.pattern = ""
        options.extension = ""

    # Building the factory is only a sanity check of the input files; the
    # resulting object is not used afterwards.
    pattern = options.pattern or ""
    if data is None:
        # nothing to check from the command line
        pass
    elif options.extension == 'bam' or pattern.endswith(('bam', 'bed')):
        FileFactory(data)
    else:
        FastQFactory(data,
                     read_tag=options.input_readtag,
                     verbose=options.verbose)

    if options.pipeline == 'quality_control' or options.pipeline == 'rnaseq':
        # check combo. One bit per option: no_adapters=16, design=8,
        # adapters=4, adapter_fwd=2, adapter_rev=1
        flag = int(
            "%s%s%s%s%s" %
            (int(bool(options.no_adapters)), int(bool(options.design)),
             int(bool(options.adapters)), int(bool(
                 options.adapter_fwd)), int(bool(options.adapter_rev))), 2)

        if flag not in [16, 12, 6, 4, 2, 3]:
            logger.critical(
                "You must use a design experimental file using --design"
                " and --adapters to indicate the type of adapters (PCRFree"
                " or Nextera), or provide the adapters directly as a "
                " string (or a file) using --adapter_fwd (AND --adapter_"
                "rev for paired-end data). A third way is to set --adapters"
                " to either Nextera, PCRFree, Rubicon or universal in which case "
                " all adapters will be used (slower). Finally, you may use "
                " --no-adapters for testing purpose or if you know there "
                " is no adapters")
            sys.exit(1)

        # flag 12 (design + adapters when wrong args provided)
        if options.design and options.adapters not in adapters_choice:
            raise ValueError(
                "When using --design, you must also "
                "provide the type of adapters using --adapters (set to "
                "one of %s )" % adapters_choice)
        if options.design and options.adapters:
            from sequana import FindAdaptersFromDesign
            fa = FindAdaptersFromDesign(options.design, options.adapters)
            fa.check()

        # flag 12 (design + adapters with correct args)
        # NOTE(review): this branch looks unreachable (the previous "if"
        # already catches design + adapters) and it sets "adapters_fwd" /
        # "adapters_rev" whereas the rest of the function uses "adapter_fwd"
        # / "adapter_rev" -- to be confirmed
        elif options.design and options.adapters in adapters_choice:
            options.adapters_fwd = options.adapters
            options.adapters_rev = options.adapters
        elif options.no_adapters:
            options.adapter_fwd = "XXXX"
            options.adapter_rev = "XXXX"
        else:
            if options.adapter_fwd is None:
                if options.adapters not in ["universal"] + adapters_choice:
                    msg = "Incorrect adapter choice %s. " % options.adapters
                    msg += "Correct values are :\n"
                    for this in ['universal'] + adapters_choice:
                        msg += " - {}\n ".format(this)
                    logger.error(msg)
                    raise ValueError(msg)
                # flag 4
                if options.adapters == "universal":
                    options.adapter_fwd = "GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGC"
                    options.adapter_rev = "TCTAGCCTTCTCGCAGCACATCCCTTTCTCACATCTAGAGCCACCAGCGGCATAGTAA"
                # flag 4
                else:
                    # Let the pipeline handle the names
                    options.adapter_fwd = options.adapters
                    options.adapter_rev = options.adapters
            # flag 2/3
            else:
                if options.adapter_fwd:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_fwd):
                        AdapterReader(options.adapter_fwd)
                        options.adapter_fwd = "file:%s" % options.adapter_fwd
                if options.adapter_rev:
                    # Could be a string or a file. If a file, check the format
                    if os.path.exists(options.adapter_rev):
                        AdapterReader(options.adapter_rev)
                        options.adapter_rev = "file:%s" % options.adapter_rev
        if options.design:
            # Just check the format
            FindAdaptersFromDesign(options.design, options.adapters)

    # If all options are valid, we can now create the tree structure
    sequana_init(options)
Esempio n. 30
0
    def __init__(self, filename_fastq, fof_databases, threads=1,
                 output_directory="./kraken_hierarchical/", 
                 keep_temp_files=False, force=False):
        """.. rubric:: **constructor**

        :param filename_fastq: FastQ file to analyse (a filename, or a list
            of one or two filenames)
        :param fof_databases: file that contains a list of database paths
            (one per line; empty lines are ignored). The order is important.
            Note that you may also provide a list of database paths.
        :param threads: number of threads to be used by Kraken
        :param output_directory: name of the output directory
        :param keep_temp_files: bool, if True, will keep intermediate files
            from each Kraken analysis, and save html report at each step
        :param bool force: if the output directory already exists, the
            instantiation fails so that the existing data is not overwritten.
            If you wish to overwrite the existing directory, set this
            parameter to True.
        :raises TypeError: if *fof_databases* is neither an existing file nor
            a list, or if *filename_fastq* is neither a string nor a list of
            one or two items.
        :raises OSError: if the output directory cannot be created for a
            reason other than "it already exists".
        :raises Exception: if the output directory already exists and *force*
            is not set.
        """
        # When running kraken in paired mode and saving the unclassified reads
        # in a file, the output file (fastq) contains both R1 and R2 so they
        # are concatenated in the same file. Actually, if there is R1 and R2,
        # they are concatenated as R1 N R2 (with the letter N as link).
        # So, in the hierarchical search, paired case, the first iteration has
        # 2 input files, but subsequent iterations will have only one file as
        # input, that is the output of the previous run (provided by
        # --unclassified-out option)
        self.filename_fastq = filename_fastq

        # input databases may be stored in a file
        if isinstance(fof_databases, str) and os.path.exists(fof_databases):
            with open(fof_databases, 'r') as fof:
                # one path per line; blank lines would otherwise end up as
                # empty database names
                self.databases = [line.strip() for line in fof if line.strip()]
        # or simply provided as a list
        elif isinstance(fof_databases, list):
            self.databases = fof_databases[:]
        else:
            raise TypeError("input databases must be a list of valid kraken "
                            "databases or a file (see documebntation)")
        self.threads = threads
        self.output_directory = output_directory
        self.keep_temp_files = keep_temp_files

        # check if the output directory already exist
        try:
            os.mkdir(output_directory)
        except OSError:
            if not os.path.isdir(output_directory):
                # The failure is not "directory already there" (missing
                # parent, permissions, a regular file with that name...):
                # do not hide it.
                raise
            if not force:
                msg = 'Output directory %s already exists' % output_directory
                logger.error(msg)
                raise Exception(msg)
            logger.warning("Output directory %s already exists. You may "
                "overwrite existing results" % output_directory)

        # list of input fastq files
        if isinstance(filename_fastq, list) and len(filename_fastq) in [1, 2]:
            self.inputs = filename_fastq[:]
        elif isinstance(filename_fastq, str):
            self.inputs = [filename_fastq]
        else:
            msg = "input file must be a string or list of 2 filenames"
            msg += "\nYou provided {}".format(filename_fastq)
            raise TypeError(msg)