Example #1
0
    def create_summary_pages(self):
        """Build the HTML summary (index.html) pages.

        Call this method once the main analysis is done (:meth:`analyse`)
        and the company packages have been created
        (:meth:`create_data_packages_for_companies`). Pages are written in
        this order:

        #. a summary page for the tissues (main directory),
        #. for each company, a summary page for the tissues of that company,
        #. a main index towards each company.

        The final directory tree looks like::

            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2

        An error raised while processing one company is printed and the
        remaining companies are still processed.
        """
        # Tissue-level summary (tissue_packages)
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # One summary per company
        print(purple("Creating summary index.html for each company"))
        progress = Progress(len(self.companies))
        for done, company in enumerate(self.companies, start=1):
            try:
                company_path = self.company_directory + os.sep + company
                self._create_summary_pages(company_path, verbose=False,
                                           company=company)
            except Exception as err:
                # Best effort: report the failure and move on to the next one
                message = "Issue with %s. Continue with other companies" % company
                print(red(message))
                print(err)
            progress.animate(done)

        # Last step: the index that points towards each company
        self._create_main_index()
Example #2
0
    def create_summary_pages(self):
        """Create the summary HTML pages (index.html files).

        To be used after the main analysis (:meth:`analyse`) and after the
        creation of the company packages
        (:meth:`create_data_packages_for_companies`). A summary page is
        created for the tissues, then a similar page for the tissues of each
        company, and finally a summary page for the companies.

        The resulting directory tree looks like::

            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2

        If a company fails, the error is printed and the loop carries on
        with the other companies.
        """
        # Step 1: the main directory (tissue_packages)
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Step 2: every company in turn
        print(purple("Creating summary index.html for each company"))
        bar = Progress(len(self.companies))
        for count, company in enumerate(self.companies, start=1):
            try:
                directory = self.company_directory + os.sep + company
                self._create_summary_pages(directory, verbose=False,
                                           company=company)
            except Exception as err:
                # A failing company must not stop the whole run
                warning = "Issue with %s. Continue with other companies" % company
                print(red(warning))
                print(err)
            bar.animate(count)

        # Step 3: an index towards each company
        self._create_main_index()
Example #3
0
    def _analyse_all(self, multicore=None):
        """Run the ANOVA and create the HTML report for each genomic feature file.

        The files in ``self.gf_filenames`` are processed in sorted order.
        For each of them, the results of ``anova_all`` are stored in
        ``self.results`` under a label extracted from the filename, and the
        HTML pages are created with figures saved.

        :param multicore: forwarded as is to ``ANOVA.anova_all``.
        """
        for gf_filename in sorted(self.gf_filenames):
            # Label = text between the first underscore and the next dot
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                # Fall back on IC50Cluster when IC50 cannot read the file.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still stop the run.
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                verbose=False)

            if self.test is True:
                # Test mode: restrict the features to the first 15 columns
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init() # This reset the directory

            results = an.anova_all(multicore=multicore)
            # The directory is set after init(), which resets it
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)
Example #4
0
    def _analyse_all(self, multicore=None):
        """Analyse every genomic feature file and build its HTML report.

        Files from ``self.gf_filenames`` are handled in sorted order. Each
        iteration creates the output sub-directory, runs the ANOVA, stores
        the results in ``self.results`` (keyed by the label extracted from
        the filename) and creates the HTML pages with figures saved.

        :param multicore: passed unchanged to ``ANOVA.anova_all``.
        """
        for gf_filename in sorted(self.gf_filenames):
            # Label = text between the first underscore and the next dot
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                # Fall back on IC50Cluster when IC50 cannot read the file.
                # A bare except would also swallow KeyboardInterrupt and
                # SystemExit, hence the explicit Exception.
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode, verbose=False)

            if self.test is True:
                # Test mode: restrict the features to the first 15 columns
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init()  # This reset the directory

            results = an.anova_all(multicore=multicore)
            # The directory is set after init(), which resets it
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)
Example #5
0
def check_ipython_notebook():
    """Run every notebook matching ``*ipynb`` in the current directory.

    Each notebook is read as JSON, executed with ``NotebookRunner`` and a
    progress bar is updated after each run. Any exception raised while
    reading or running a notebook propagates to the caller.
    """
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        # Close the file handle as soon as the notebook has been parsed
        with open(filename) as handle:
            notebook = read(handle, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
def check_ipython_notebook():
    """Execute all the notebooks (``*ipynb``) found in the current directory.

    The notebooks are read as JSON and run one after the other with
    ``NotebookRunner``; the progress bar is updated after each of them.
    Errors raised while reading or running a notebook are not caught here.
    """
    notebooks = glob.glob("*ipynb")
    N = len(notebooks)

    pb = Progress(N)
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        # The context manager guarantees that the file is closed
        with open(filename) as handle:
            notebook = read(handle, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
Example #7
0
def main(args=None):
    """Entry point of the sequana_substractor standalone.

    :param list args: command-line arguments including the program name as
        first item. Defaults to a copy of ``sys.argv``.

    When no argument follows the program name, ``--help`` is used. When
    ``--version`` is among the arguments, the sequana version is printed
    and the program exits with status 0.
    """
    if args is None:
        args = sys.argv[:]
    else:
        # Work on a copy: "--help" may be appended below and the list given
        # by the caller must be left untouched.
        args = list(args)

    print(purple("Welcome to sequana_substractor"))
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))
    user_options = Options(prog="sequana_substractor")
    if len(args) == 1:
        args.append("--help")

    # Look at the arguments actually being parsed, not at the process-wide
    # sys.argv, so that main(["prog", "--version"]) behaves as expected.
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])
    logger.setLevel(options.level)

    # build the references list
    # NOTE(review): when both --reference and --references are provided,
    # the former is discarded -- confirm this is intended.
    references = []
    if options.reference:
        references.append(options.reference)
    if options.references:
        references = options.references
    options.references = references

    references = []
    # expand globs if any
    for ref in options.references:
        references.extend(glob.glob(ref))

    logger.info("{} references provided: {}".format(len(references),
                                                    ",".join(references)))

    # call the entire machinery here
    sub = Substractor(options.input, references, options.outdir,
                      options.mapper, options.threads)
    sub.run(options.outfile)
Example #8
0
def main(args=None):
    """Entry point of the sequana_bam_splitter standalone.

    :param list args: command-line arguments including the program name as
        first item. Defaults to a copy of ``sys.argv``.

    When no argument follows the program name, ``--help`` is used. When
    ``--version`` is among the arguments, the sequana version is printed
    and the program exits with status 0. Otherwise the input file is
    processed by ``_main`` and the matched/unmatched counts and the flags
    are logged.
    """
    if args is None:
        args = sys.argv[:]
    else:
        # Work on a copy: "--help" may be appended below and the list given
        # by the caller must be left untouched.
        args = list(args)

    print(purple("Welcome to sequana_bam_splitter"))
    user_options = Options(prog="sequana_bam_splitter")
    if len(args) == 1:
        args.append("--help")

    # Look at the arguments actually being parsed, not at the process-wide
    # sys.argv, so that main(["prog", "--version"]) behaves as expected.
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])

    # set the level
    logger.level = options.level
    logger.info("This SAM/BAM/CRAM splitter is used for paired or un-paired "
                "reads with perfectly mapped or unmapped reads (flags 0, 4, "
                "16). Others are dropped.")

    logger.info("Reading {}".format(options.input))

    # What prefix used for the output filename ?
    if options.prefix is None:
        # NOTE(review): the prefix used to be computed with
        # options.input.rstrip(".bam") and then immediately overwritten by
        # "test". That dead statement was removed (rstrip strips characters,
        # not a suffix, so it was wrong anyway). The effective default is
        # unchanged; confirm whether it should derive from the input name.
        prefix = "test"
    else:
        prefix = options.prefix

    if options.outdir:
        prefix = options.outdir + os.sep + prefix
        if not os.path.exists(options.outdir):
            from easydev import mkdirs
            logger.info("Creating {} directory".format(options.outdir))
            mkdirs(options.outdir)

    match, unmatch, flags = _main(options.input,
                                  prefix,
                                  keep_unmapped=options.keep_unmapped)

    logger.info("Matched: {}".format(match))
    logger.info("Unmatched (flag 4 and 256): {}".format(unmatch))
    logger.info("All flags: {}".format(Counter(flags)))
Example #9
0
    def  __init__(self, prog="sequana_coverage"):
        usage = purple("""\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content. 

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position 
      and the third column contains the coverage itself. 
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35  -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam  -aa > test.bed
        sequana_coverage --input test.bed --show-html

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.


    Note for multi chromosome and genbank features: for now, you will need to call 
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

    chromosome order in the BED and 

        """)

        epilog = purple("""
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)

        description = """DESCRIPTION:
        """

        super(Options, self).__init__(usage=usage, prog=prog,
                description=description, epilog=epilog,
                formatter_class=CustomFormatter)

        # options to fill the config file
        group = self.add_argument_group("Required argument")
        group.add_argument("-i", "--input", dest="input", type=str,
            help=("Input file in BED or BAM format. If a BAM file is "
                 "provided, it will be converted locally to a BED file "
                 "using genomecov, which must be installed."))

        group = self.add_argument_group("Optional biological arguments")
        group.add_argument(
            '-c', "--chromosome", dest="chromosome", type=int, default=0,
            help="Chromosome number (if only one, no need to use: the single"
                 " chromosome is chosen automatically). Default is "
                 " first chromosome found in the BED file. You may want to"
                 " analyse all chromosomes at the same time. If so, set this"
                 " parameter to 0")
        group.add_argument('-o', "--circular", dest="circular",
            default=False, action="store_true",
            help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""")

        group = self.add_argument_group("General")
        group.add_argument("--output-directory", dest="output_directory",
            default="report", help="name of the output (report) directory.")
        group.add_argument('--show-html', dest="show_html", default=False,
            action='store_true',
            help="""When report is created, you can open
            the main page automatically with this option (default is False)""")
        group.add_argument("-q", "--quiet", dest="verbose",
            default=True, action="store_false")
        group.add_argument('--no-report', dest="create_report",
            default=True, action='store_false',
            help="""Do not create any HTML report""")
        group.add_argument("--logging-level", dest="logging_level",
            default="INFO",
            help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
        group = self.add_argument_group('Annotation')
        group.add_argument("-b", "--genbank", dest="genbank",
            type=str, default=None, help='a valida genbank annotation')

        # Group related to GC content
        group = self.add_argument_group("GC content related")
        group.add_argument('-r', "--reference", dest="reference", type=str,
            default=None,
            help="""If available, you can provide a reference (ENA/NCBI). It
                 must have the same length as the one used to create the
                 BAM or BED file. If provided, it is used to create the
                 coverage versus GC content image""")
        group.add_argument(
            "-g", "--window-gc", dest="w_gc", type=int, default=201,
            help="""Length of the running window to compute the GC content""")
        group.add_argument('-n', "--nlevels", dest="levels", type=int,
            default=3, help="""Number of levels in the contour""")

        #group running median
        group = self.add_argument_group("Running Median related")
        group.add_argument("-w", "--window-median", dest="w_median", type=int,
            help="""Length of the running median window (default 4001,
                 recommended for viruses).  For long genome, 20001
                 or 30001 is recommended but larger windows may be
                 useful in the presence of long deleted regions.""",
            default=4001)

        group.add_argument("-k", "--mixture-models", dest="k", type=int,
            help="""Number of mixture models to use (default 2, although if sequencing
        depth is below 8, k is set to 1 automatically). To ignore that behaviour
        set k to the required value""",
            default=None)

        group.add_argument("-L", "--low-threshold", dest="low_threshold",
            default=None, type=float,
            help=("lower threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-H", "--high-threshold", dest="high_threshold",
            default=None, type=float,
            help=("higher threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-T", "--threshold", dest="threshold",
            default=4, type=float,
            help="""set lower and higher thresholds of the confidence interval.""")

        group = self.add_argument_group("Download reference")
        group.add_argument("--download-reference", dest="download_reference",
            default=None, type=str)
        group.add_argument("--download-genbank", dest="download_genbank",
            default=None, type=str)
        group.add_argument("--database", dest="database",
            default="ENA", type=str,
            choices=["ENA", "EUtils"],
            help="Download the reference from one of these database (default ENA)")
Example #10
0
    def  __init__(self, prog="sequana_coverage"):
        usage = purple("""\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content.

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position
      and the third column contains the coverage itself.
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35  -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam  -aa > test.bed
        sequana_coverage --input test.bed --show-html

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.

    Note for multi chromosome and genbank features: for now, you will need to call
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

    Large genomes:
    --------------

    If your input data is large and does not fit into memory, use the --binning BIN
    options to average data into bin of BIN values.

    CNV cases:
    --------------

    By default, sequana_coverage identify events as small as 1 bin. For the CNV
    detection case, you may want to cluster events. the --cnv-merging DELTA option
    merges consecutives events whose distance is smaller that DELTA


        """)

        epilog = purple("""
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)

        description = """DESCRIPTION:
        """

        super(Options, self).__init__(usage=usage, prog=prog,
                description=description, epilog=epilog,
                formatter_class=CustomFormatter)

        # options to fill the config file
        group = self.add_argument_group("Required argument")
        group.add_argument("-i", "--input", dest="input", type=str,
            help=("Input file in BED or BAM format. If a BAM file is "
                 "provided, it will be converted locally to a BED file "
                 "using genomecov, which must be installed."))

        group = self.add_argument_group("Optional biological arguments")
        group.add_argument(
            '-c', "--chromosome", dest="chromosome", type=int, default=-1,
            help="Chromosome number (if only one chromosome found, the single"
                 " chromosome is chosen automatically). Otherwise all "
                 "chromosomes are analysed. You may want to analyse only one"
                 " in which case, use this parameter (e.g., -c 1). "
                "!!START AT INDEX 0 !!")
        group.add_argument('-o', "--circular", dest="circular",
            default=False, action="store_true",
            help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""")

        group = self.add_argument_group("General")
        group.add_argument("--output-directory", dest="output_directory",
            default="report", help="name of the output (report) directory.")
        group.add_argument("-q", "--quiet", dest="verbose",
            default=True, action="store_false")
        group.add_argument('--no-html', dest="skip_html",
            default=False, action='store_true',
            help="""Do not create any HTML reports. Save ROIs and statistics only.""")
        group.add_argument('--no-multiqc', dest="skip_multiqc",
            default=False, action='store_true',
            help="""Do not create any multiqc HTML page.""")
        group.add_argument("--debug-level", dest="logging_level",
            default="INFO",
            help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
        group.add_argument("--level", dest="logging_level",
            default="INFO",
            help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
        group = self.add_argument_group('Annotation')
        group.add_argument("-b", "--genbank", dest="genbank",
            type=str, default=None, help='a valid genbank annotation')

        # Group related to GC content
        group = self.add_argument_group("GC content related")
        group.add_argument('-r', "--reference", dest="reference", type=str,
            default=None,
            help="""If available, you can provide a reference (ENA/NCBI). It
                 must have the same length as the one used to create the
                 BAM or BED file. If provided, it is used to create the
                 coverage versus GC content image""")
        group.add_argument(
            "-g", "--window-gc", dest="w_gc", type=int, default=201,
            help="""Length of the running window to compute the GC content""")
        group.add_argument('-n', "--nlevels", dest="levels", type=int,
            default=3, help="""Number of levels in the contour""")

        #group running median
        group = self.add_argument_group("Running Median and clustering related")
        group.add_argument("-w", "--window-median", dest="w_median", type=int,
            help="""Length of the running median window (default 20001,
                recommended for bacteria).  For short genome (below 100000
                bases), we set this parameter to one fifth of the genome 
                length .""",
            default=20001)


        group.add_argument("-k", "--mixture-models", dest="k", type=int,
            help="""Number of mixture models to use (default 2, although if sequencing
        depth is below 8, k is set to 1 automatically). To ignore that behaviour
        set k to the required value""",
            default=2)

        group.add_argument("-L", "--low-threshold", dest="low_threshold",
            default=None, type=float,
            help=("lower threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-H", "--high-threshold", dest="high_threshold",
            default=None, type=float,
            help=("higher threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-T", "--threshold", dest="threshold",
            default=4, type=float,
            help="""set lower and higher thresholds of the confidence interval.""")
        group.add_argument("-C", "--clustering-parameter", dest="double_threshold",
            default=0.5, type=float,
            help="""set lower and higher double threshold parameter (in [0,1]).
Do not use value close to zero. Ideally, around 0.5. lower value will tend to
cluster more than higher value""")

        group = self.add_argument_group("Large data related - CNV detection")
        group.add_argument("-s", "--chunk-size", dest="chunksize", type=int,
            default=5000000, min=1000000, action=Min,
            help="""Length of the chunk to be used for the analysis. """)
        group.add_argument("-B", "--binning", dest="binning", type=int,
            default=None, min=2, action=Min,
            help="""merge consecutive (non overlapping) data points, taking the
mean. This is useful for large genome (e.g. human). This allows a faster
computation, especially for CNV detection were only large windows are of
interest. For instance, using a binning of 50 or 100 allows the human genome to
be analysed.""")
        group.add_argument("--cnv-clustering", dest="cnv_clustering",
            default=-1, type=int,
            help="""Two consecutive ROIs are merged when their distance in bases
is below this parameter. If set to -1, not used. """)

        # group facilities
        group = self.add_argument_group("Download reference")
        group.add_argument("--download-reference", dest="download_reference",
            default=None, type=str)
        group.add_argument("--download-genbank", dest="download_genbank",
            default=None, type=str)
        group.add_argument("--database", dest="database",
            default="ENA", type=str,
            choices=["ENA", "EUtils"],
            help="Download the reference from one of these database (default ENA)")
Example #11
0
    def  __init__(self, prog="sequana_coverage"):
        usage = purple("""\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content.

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position
      and the third column contains the coverage itself.
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35  -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam  -aa > test.bed
        sequana_coverage --input test.bed --show-html

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.

    Note for multi chromosome and genbank features: for now, you will need to call
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

    Large genomes:
    --------------

    If your input data is large and does not fit into memory, use the --binning BIN
    options to average data into bin of BIN values.

    CNV cases:
    --------------

    By default, sequana_coverage identify events as small as 1 bin. For the CNV
    detection case, you may want to cluster events. the --cnv-merging DELTA option
    merges consecutives events whose distance is smaller that DELTA


        """)

        epilog = purple("""
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)

        description = """DESCRIPTION:
        """

        super(Options, self).__init__(usage=usage, prog=prog,
                description=description, epilog=epilog,
                formatter_class=CustomFormatter)

        # options to fill the config file
        group = self.add_argument_group("Required argument")
        group.add_argument("-i", "--input", dest="input", type=str,
            help=("Input file in BED or BAM format. If a BAM file is "
                 "provided, it will be converted locally to a BED file "
                 "using genomecov, which must be installed."))

        group = self.add_argument_group("Optional biological arguments")
        group.add_argument(
            '-c', "--chromosome", dest="chromosome", type=int, default=-1,
            help="Chromosome number (if only one chromosome found, the single"
                 " chromosome is chosen automatically). Otherwise all "
                 "chromosomes are analysed. You may want to analyse only one"
                 " in which case, use this parameter (e.g., -c 1). "
                "!!START AT INDEX 0 !!")
        group.add_argument('-o', "--circular", dest="circular",
            default=False, action="store_true",
            help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""")

        group = self.add_argument_group("General")
        group.add_argument("--output-directory", dest="output_directory",
            default="report", help="name of the output (report) directory.")
        group.add_argument("-q", "--quiet", dest="verbose",
            default=True, action="store_false")
        group.add_argument('--no-html', dest="skip_html",
            default=False, action='store_true',
            help="""Do not create any HTML reports. Save ROIs and statistics only.""")
        group.add_argument('--no-multiqc', dest="skip_multiqc",
            default=False, action='store_true',
            help="""Do not create any multiqc HTML page.""")
        group.add_argument("--debug-level", dest="logging_level",
            default="INFO",
            help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
        group.add_argument("--level", dest="logging_level",
            default="INFO",
            help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
        group = self.add_argument_group('Annotation')
        group.add_argument("-b", "--genbank", dest="genbank",
            type=str, default=None, help='a valid genbank annotation')

        # Group related to GC content
        group = self.add_argument_group("GC content related")
        group.add_argument('-r', "--reference", dest="reference", type=str,
            default=None,
            help="""If available, you can provide a reference (ENA/NCBI). It
                 must have the same length as the one used to create the
                 BAM or BED file. If provided, it is used to create the
                 coverage versus GC content image""")
        group.add_argument(
            "-g", "--window-gc", dest="w_gc", type=int, default=201,
            help="""Length of the running window to compute the GC content""")
        group.add_argument('-n', "--nlevels", dest="levels", type=int,
            default=3, help="""Number of levels in the contour""")

        #group running median
        group = self.add_argument_group("Running Median and clustering related")
        group.add_argument("-w", "--window-median", dest="w_median", type=int,
            help="""Length of the running median window (default 20001,
                recommended for bacteria).  For short genome (below 100000
                bases), we set this parameter to one fifth of the genome 
                length .""",
            default=20001)


        group.add_argument("-k", "--mixture-models", dest="k", type=int,
            help="""Number of mixture models to use (default 2, although if sequencing
        depth is below 8, k is set to 1 automatically). To ignore that behaviour
        set k to the required value""",
            default=2)

        group.add_argument("-L", "--low-threshold", dest="low_threshold",
            default=None, type=float,
            help=("lower threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-H", "--high-threshold", dest="high_threshold",
            default=None, type=float,
            help=("higher threshold (zscore) of the confidence interval. "
                "Overwrite value given by --threshold/-T"))
        group.add_argument("-T", "--threshold", dest="threshold",
            default=4, type=float,
            help="""set lower and higher thresholds of the confidence interval.""")
        group.add_argument("-C", "--clustering-parameter", dest="double_threshold",
            default=0.5, type=float,
            help="""set lower and higher double threshold parameter (in [0,1]).
Do not use value close to zero. Ideally, around 0.5. lower value will tend to
cluster more than higher value""")

        group = self.add_argument_group("Large data related - CNV detection")
        group.add_argument("-s", "--chunk-size", dest="chunksize", type=int,
            default=5000000, min=1000000, action=Min,
            help="""Length of the chunk to be used for the analysis. """)
        group.add_argument("-B", "--binning", dest="binning", type=int,
            default=None, min=2, action=Min,
            help="""merge consecutive (non overlapping) data points, taking the
mean. This is useful for large genome (e.g. human). This allows a faster
computation, especially for CNV detection were only large windows are of
interest. For instance, using a binning of 50 or 100 allows the human genome to
be analysed.""")
        group.add_argument("--cnv-clustering", dest="cnv_clustering",
            default=-1, type=int,
            help="""Two consecutive ROIs are merged when their distance in bases
is below this parameter. If set to -1, not used. """)

        # group facilities
        group = self.add_argument_group("Download reference")
        group.add_argument("--download-reference", dest="download_reference",
            default=None, type=str)
        group.add_argument("--download-genbank", dest="download_genbank",
            default=None, type=str)
        group.add_argument("--database", dest="database",
            default="ENA", type=str,
            choices=["ENA", "EUtils"],
            help="Download the reference from one of these database (default ENA)")
Example #12
0
from sequana.scripts.tools import SequanaOptions

from sequana import logger
from easydev.console import purple



class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                      argparse.RawDescriptionHelpFormatter):
    """Help formatter combining two stock argparse formatters.

    Default values are appended to the help of each argument, and the
    description/epilog texts are printed as written (no re-wrapping).
    """


# Credits/links text coloured with purple() from easydev.console.
# NOTE(review): presumably used as the epilog of the argparse help --
# the code consuming it is not visible here.
epilog = purple("""
----

AUTHORS: Thomas Cokelaer
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)


class Options(argparse.ArgumentParser, SequanaOptions):
    def  __init__(self, prog="sequana_vcf_filter"):
        usage = """%s Only for VCF using mpileup version 4.1 for now\n""" % prog
        usage += """usage2: %s vcf_filter""" % prog
        usage += """Examples:

    sequana_vcf_filter --input test.vcf --quality 40
                --filter "AF1>0.95&AF1<0.05"
                --filter "MQ<30"
Example #13
0
def main(args=None):
    """Command-line entry point of bioconvert.

    :param list args: the full command line, program name first (same
        layout as ``sys.argv``). Defaults to a copy of ``sys.argv``.

    The options are parsed, the logging level is set and ``analysis`` is
    called once per input file. In batch mode (-m/--batch) the input file
    is used as a glob pattern and may therefore expand to several files.
    """
    if args is None:
        args = sys.argv[:]

    from easydev.console import purple
    if "-v" in args or "--verbosity" in args:
        print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)"))

    arg_parser = argparse.ArgumentParser(prog="bioconvert",
                                         epilog=" ----    ",
                                         description="""Convertor infer the
                                         formats from the extension name. We do
                                         not scan the input file. Therefore
                                         users must ensure that their input
                                         format files are properly
                                         formatted.""",
                                         usage="""
    # convert fastq to fasta
    bioconvert test.fastq test.fasta

    # if input extension is not standard, use -i to specify it
    bioconvert test.FASTQ test.fasta -i fastq

    bioconvert test.fastq -o fasta

    # You may have several inputs, in which case wildcards are possible
    # Note, however, the quotes that are required
    bioconvert "test*.fastq" -o fasta

    # batch is also possible. 
    bioconvert "test*.fastq" -o fasta -m 

    Note the difference between the two previous commands !!

""")
    arg_parser.add_argument("input_file",
                            default=None,
                            help="The path to the file to convert.")
    arg_parser.add_argument("output_file",
                            nargs="?",
                            default=None,
                            help="The path where the result will be stored.")

    arg_parser.add_argument("-f",
                            "--formats",
                            action=ConvAction,
                            default=False,
                            help="Display available formats and exit.")
    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default="INFO",
        help=
        "Set the outpout verbosity. Should be one of DEBUG, INFO, WARNING, ERROR, CRITICAL"
    )
    arg_parser.add_argument(
        "-i",
        "--input-format",
        default=None,
        help=
        "Provide the input format. Check the --formats to see valid input name"
    )
    arg_parser.add_argument(
        "-o",
        "--output-format",
        default=None,
        help=
        "Provide the output format. Check the --formats to see valid input name"
    )
    arg_parser.add_argument(
        "-x",
        "--threads",
        default=None,
        type=int,
        help="Number of threads. Depends on the underlying tool")
    arg_parser.add_argument("-m",
                            "--batch",
                            default=False,
                            action="store_true",
                            help="for batch effect")

    arg_parser.add_argument("-c",
                            "--method",
                            default=None,
                            help="A converter may have several methods")

    arg_parser.add_argument(
        "-F",
        "--force",
        action="store_true",
        help="if outfile exists, it is overwritten with this option")

    arg_parser.add_argument("-s",
                            "--show-methods",
                            default=False,
                            action="store_true",
                            help="A converter may have several methods")

    arg_parser.add_argument("-b",
                            "--benchmark",
                            default=False,
                            action="store_true",
                            help="Running all available methods")

    arg_parser.add_argument("-N",
                            "--benchmark-N",
                            default=5,
                            type=int,
                            help="Number of trials for each methods")

    # Parse the command line given to main() instead of always falling back
    # on sys.argv; the first item is the program name and is skipped.
    args = arg_parser.parse_args(args[1:])

    # Set the logging level
    bioconvert.logger_set_level(args.verbosity)

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    import glob
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    # analysis() reads the file to convert from args.input_file, so the
    # namespace is updated before each call
    for filename in filenames:
        args.input_file = filename
        analysis(args)
Example #14
0
 def purple(self, txt, force=False):
     """Print *txt* in purple if verbose mode is on or *force* is True."""
     if not (self.verbose or force is True):
         return
     print(purple(txt))
Example #15
0
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        :param companies: a company name (string), a list of company names
            or None (default), in which case :attr:`companies` is used.
        :raises ValueError: if the list of companies is empty.

        For each company and each genomic feature file found in
        :attr:`gf_filenames`, the ANOVA results and the IC50 are restricted
        to the drugs that are public (WEBRELEASE=='Y') or owned by the
        company, and the HTML report is created in the directory
        <company_directory>/<company>/<tcga>.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, if not available, from the file saved on disk
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed_drugs = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [x in allowed_drugs for x in results.df.DRUG_ID]

                # .loc replaces the .ix indexer, which was removed from pandas
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                # Keep only the IC50 columns found in the company DrugDecode
                # (boolean selection replaces DataFrame.select, which was
                # removed from pandas)
                to_keep = [drug in allowed_drugs
                           for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, to_keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
Example #16
0
def anova_pipeline(args=None):
    """This function is used by the standalone application called
    **gdsctools_anova**

    Type::

        gdsctools_anova --help

    to get some help.

    :param list args: the command line, program name first. If None, a
        copy of ``sys.argv`` is used. If only one item is provided,
        ``--help`` is appended. The list provided by the caller is never
        modified.
    """
    msg = "Welcome to GDSCTools standalone"
    print_color(msg, purple, underline=True)

    # Keep the argument args as None by default to
    # allow testing e.g., in nosetests
    if args is None:
        args = sys.argv[:]
    elif len(args) == 1:
        # build a new list so that the caller's list is left untouched
        args = list(args) + ['--help']

    user_options = ANOVAOptions(prog="gdsctools_anova")
    try:
        options = user_options.parse_args(args[1:])
    except SystemExit:
        # raised by argparse e.g. after printing the help or a usage error
        return

    # -----------------------------------------------------------------
    # ---------------------------------------- options without analysis
    # -----------------------------------------------------------------

    if options.version is True:
        print("This is version %s of gdsctools_anova" % gdsctools.version)
        return

    if options.testing is True:
        print('Testing mode:')
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        df = an.anova_one_drug_one_feature('Drug_1047_IC50', 'TP53_mut')

        assert df.loc[1,'N_FEATURE_pos'] == 554, \
            "N_feature_pos must be equal to 554"
        print(df.T)
        print(darkgreen("\nGDSCTools seems to be installed properly"))
        return

    if options.save_settings:
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        an.settings.to_json(options.save_settings)
        print('Save a default parameter set in %s' % options.save_settings)
        return

    if options.license is True:
        print(gdsctools.license)
        return

    if options.summary is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)
        print(an)
        return

    if options.print_tissues is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)

        tissues = an.tissue_factor
        try:
            tissues = tissues.sort_values('Tissue Factor').unique()
        except Exception:
            # NOTE(review): presumably a fallback for pandas versions
            # where sort_values is not available -- to be confirmed
            tissues = tissues.sort(inplace=False).unique()
        for name in tissues:
            print(name)
        return

    if options.print_drugs is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.drugIds))))
        return

    if options.print_features is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.feature_names))))
        return

    # -----------------------------------------------------------------
    # --------------------------------------------------- real analysis
    # -----------------------------------------------------------------
    # dispatcher to the functions according to the user parameters

    from gdsctools import ANOVA, ANOVAReport
    anova = ANOVA(options.input_ic50,
                  options.input_features,
                  options.input_drug,
                  low_memory=not options.fast)
    anova = _set_settings(anova, options)

    if options.drug and options.drug not in anova.ic50.df.columns:
        print(red("Invalid Drug. Try --print-drug-names"))
        sys.exit(1)

    # The mode depends on whether a drug and/or a feature were provided
    if options.drug is not None and options.feature is not None:
        print_color("ODOF mode", purple)
        anova_one_drug_one_feature(anova, options)
    elif options.drug is not None:
        print_color("ODAF mode", purple)
        anova_one_drug(anova, options)
    else:  # analyse everything
        if options.feature is None:
            print_color("ADAF mode", purple)
        else:
            print_color("ADOF mode", purple)
        anova_all(anova, options)

    if options.onweb is False and options.no_html is False:
        msg = "\nNote that a directory {} was created and files saved into it"
        print(purple(msg.format(options.directory)))

    return
Example #17
0
def main(args=None):
    """Command-line entry point of bioconvert.

    :param list args: the full command line, program name first (same
        layout as ``sys.argv``). Defaults to a copy of ``sys.argv``.
    :raises RuntimeError: if the input or output format cannot be inferred
        because an extension is missing.

    The input and output formats are inferred from the file extensions
    (the input one may be overwritten with --input-format). The converter
    registered for that pair of extensions is then instantiated and called.
    The program exits with status 1 if no converter is registered.
    """
    if args is None:
        args = sys.argv[:]

    from easydev.console import purple
    print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)"))

    arg_parser = argparse.ArgumentParser(prog="bioconvert",
                                         epilog=" ----    ",
                                         description="""DESCRIPTION:

Convertor infer the formats from the extension name. We do not scan the
input file. Therefore users must ensure that their input format files are
properly formatted.

""")
    arg_parser.add_argument("input_file", help="The path to the file to convert.")
    arg_parser.add_argument("output_file", help="The path where the result will be stored.")

    arg_parser.add_argument("-f", "--formats",
                            action=ConvAction,
                            default=False,
                            help="Display available formats and exit.")
    arg_parser.add_argument("-v", "--verbosity",
                            action="count",
                            default=0,
                            help="Set the outpout verbosity.")
    arg_parser.add_argument("-x", "--input-format",
                            default=None,
                            help="Provide the input format. Check the --formats to see valid input name")

    # Parse the command line given to main() instead of always falling back
    # on sys.argv; the first item is the program name and is skipped.
    args = arg_parser.parse_args(args[1:])
    # Set the logging level: 30 by default, 10 less for each -v, 10 at least
    args.verbosity = max(10, 30 - (10 * args.verbosity))
    bioconvert.logger_set_level(args.verbosity)
    _log = colorlog.getLogger('bioconvert')

    mapper = Registry()

    infile = args.input_file
    outfile = args.output_file

    # Users may provide information about the input file.
    # Indeed, the input may be a FastQ file but with an extension
    # that is not standard. For instance fq instead of fastq
    # If so, we can use the --input-format fastq to overwrite the
    # provided filename extension
    inext = os.path.splitext(infile)[-1]
    outext = os.path.splitext(outfile)[-1]

    if args.input_format:
        inext = args.input_format
        # extensions are handled with their leading dot
        if not inext.startswith("."):
            inext = "." + inext

    if not inext:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the input file name or use --input-format option.")

    if not outext:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the output file name.")

    # From the input parameters 1 and 2, we get the module name
    try:
        _log.info("Input: {}".format(inext))
        _log.info("Output: {}".format(outext))
        class_converter = mapper[(inext, outext)]
    except KeyError:
        print(mapper)
        print(inext)
        print(outext)

        # Is the module name available in biokit ? If not, let us tell the user
        msg = "Request input format ({}) to output format (({}) is not available in converters"
        _log.critical(msg.format(inext, outext))
        _log.critical("Use --formats to know the available formats")
        sys.exit(1)

    # If the module exists, it is part of the MapperRegitry dictionary and
    # we should be able to import it dynamically, create the class and call
    # the instance
    _log.info("Converting from {} to {}".format(inext, outext))
    convert = class_converter(infile, outfile)
    convert()
    _log.info("Done")
Example #18
0
def main(args=None):
    """Entry point of the biokit converter standalone.

    :param list args: the command line, program name first. Defaults to a
        copy of ``sys.argv``. The items at positions 1 and 2 are the input
        and output files; the remaining ones are parsed as options.

    The converter module is named after the two extensions joined by "2".
    The program exits with status 1 if that name is not in the registry.
    """
    from easydev.console import purple, underline
    print(purple("Welcome to biokit converter (biokit.readthedocs.io)"))
    mapper = MapperRegistry()

    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="converter")

    # The list of available conversions was requested
    wants_formats = "-f" in args or "--formats" in args
    if wants_formats:
        options = user_options.parse_args(args[1:])
        if options.format:
            print("Available mapping:")
            print("==================")
            for key in sorted(mapper):
                print("{}: {}".format(key, mapper[key]))
            sys.exit(0)

    # Input and output files are both required; otherwise show the help
    if len(args) >= 3:
        infile, outfile = args[1], args[2]
        options = user_options.parse_args(args[3:])
    else:
        user_options.parse_args(["prog", "--help"])

    # Set the logging level
    biokit_debug_level(options.logging_level)

    # Extensions (without the leading dot) give the formats. Since the
    # extension of the input file may not be standard (e.g. fq instead of
    # fastq), --input-format takes precedence when provided.
    source_ext = os.path.splitext(infile)[-1][1:]
    target_ext = os.path.splitext(outfile)[-1][1:]
    if options.input_format:
        source_ext = options.input_format

    # Name of the converter module, e.g. <input>2<output>
    converter_key = "2".join((source_ext, target_ext))

    # Tell the user if this conversion is not registered
    if converter_key not in mapper.keys():
        template = "Request input format ({}) to output format (({}) is not available in converters"
        logger.critical(template.format(source_ext, target_ext))
        logger.critical("Use --formats to know the available formats")
        sys.exit(1)

    # The module is registered: import it dynamically, fetch the class
    # given by the registry, instantiate it and call the instance
    logger.info("Converting from {} to {}".format(source_ext, target_ext))
    converter_module = importlib.import_module(
        "biokit.converters.{}".format(converter_key))
    converter_class = getattr(converter_module, mapper[converter_key])
    converter_class(infile, outfile)()
    logger.info("Done")
Example #19
0
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file

        :param companies: a company name (string), a list of company names
            or None (default), in which case :attr:`companies` is used.
        :raises ValueError: if the list of companies is empty.

        For each company and each genomic feature file found in
        :attr:`gf_filenames`, the ANOVA results and the IC50 are restricted
        to the drugs that are public (WEBRELEASE=='Y') or owned by the
        company, and the HTML report is created in the directory
        <company_directory>/<company>/<tcga>.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or, if not available, from the file saved on disk
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed_drugs = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [x in allowed_drugs for x in results.df.DRUG_ID]

                # .loc replaces the .ix indexer, which was removed from pandas
                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except Exception:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                # Keep only the IC50 columns found in the company DrugDecode
                # (boolean selection replaces DataFrame.select, which was
                # removed from pandas)
                to_keep = [drug in allowed_drugs
                           for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, to_keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
Example #20
0
def anova_pipeline(args=None):
    """This function is used by the standalone application called
    **gdsctools_anova**

    Type::

        gdsctools_anova --help

    to get some help.

    :param list args: the command line, program name first. If None, a
        copy of ``sys.argv`` is used. If only one item is provided,
        ``--help`` is appended. The list provided by the caller is never
        modified.
    """
    msg = "Welcome to GDSCTools standalone"
    print_color(msg, purple, underline=True)

    # Keep the argument args as None by default to
    # allow testing e.g., in nosetests
    if args is None:
        args = sys.argv[:]
    elif len(args) == 1:
        # build a new list so that the caller's list is left untouched
        args = list(args) + ['--help']

    user_options = ANOVAOptions(prog="gdsctools_anova")
    try:
        options = user_options.parse_args(args[1:])
    except SystemExit:
        # raised by argparse e.g. after printing the help or a usage error
        return

    # -----------------------------------------------------------------
    # ---------------------------------------- options without analysis
    # -----------------------------------------------------------------

    if options.version is True:
        print("This is version %s of gdsctools_anova" % gdsctools.version)
        return

    if options.testing is True:
        print('Testing mode:')
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        df = an.anova_one_drug_one_feature('Drug_1047_IC50', 'TP53_mut')

        assert df.loc[1,'N_FEATURE_pos'] == 554, \
            "N_feature_pos must be equal to 554"
        print(df.T)
        print(darkgreen("\nGDSCTools seems to be installed properly"))
        return

    if options.save_settings:
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        an.settings.to_json(options.save_settings)
        print('Save a default parameter set in %s' % options.save_settings)
        return

    if options.license is True:
        print(gdsctools.license)
        return

    if options.summary is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)
        print(an)
        return

    if options.print_tissues is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)

        tissues = an.tissue_factor
        try:
            tissues = tissues.sort_values('Tissue Factor').unique()
        except Exception:
            # NOTE(review): presumably a fallback for pandas versions
            # where sort_values is not available -- to be confirmed
            tissues = tissues.sort(inplace=False).unique()
        for name in tissues:
            print(name)
        return

    if options.print_drugs is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.drugIds))))
        return

    if options.print_features is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.feature_names))))
        return

    # -----------------------------------------------------------------
    # --------------------------------------------------- real analysis
    # -----------------------------------------------------------------
    # dispatcher to the functions according to the user parameters

    from gdsctools import ANOVA, ANOVAReport
    anova = ANOVA(options.input_ic50, options.input_features,
            options.input_drug,
            low_memory=not options.fast)
    anova = _set_settings(anova, options)

    if options.drug and options.drug not in anova.ic50.df.columns:
        print(red("Invalid Drug. Try --print-drug-names"))
        sys.exit(1)

    # The mode depends on whether a drug and/or a feature were provided
    if options.drug is not None and options.feature is not None:
        print_color("ODOF mode", purple)
        anova_one_drug_one_feature(anova, options)
    elif options.drug is not None:
        print_color("ODAF mode", purple)
        anova_one_drug(anova, options)
    else: # analyse everything
        if options.feature is None:
            print_color("ADAF mode", purple)
        else:
            print_color("ADOF mode", purple)
        anova_all(anova, options)

    if options.onweb is False and options.no_html is False:
        msg = "\nNote that a directory {} was created and files saved into it"
        print(purple(msg.format(options.directory)))

    return
Example #21
0
        verbo_nb = sum([1 for opt in sys.argv if opt.startswith('--verb')])
        verbosity = v_nb + verbo_nb

        bioconvert.logger_set_level(max(10, 30 - (10 * verbosity)))

        mapper = Registry()
        print("Available mapping:")
        print("==================")
        for k in sorted(mapper.get_conversions()):
            print("{} -> {}".format(k[0], k[1]))
        sys.exit(0)


# Script entry point: print a welcome banner and build the command-line
# parser with its two positional arguments (input and output files).
if __name__ == "__main__":
    from easydev.console import purple, underline
    print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)"))

    arg_parser = argparse.ArgumentParser(prog="converter",
                                         epilog=" ----    ",
                                         description="""DESCRIPTION:

Convertor infer the formats from the extension name. We do not scan the
input file. Therefore users must ensure that their input format files are
properly formatted.

""")
    # Both positional arguments are required
    arg_parser.add_argument("input_file",
                            help="The path to the file to convert.")
    arg_parser.add_argument("output_file",
                            help="The path where the result will be stored.")
    def run(self, color=True):
        """Executes 'python setup.py' with the user commands on all packages.

        :param bool color: if True (default), the messages are colored
            using the helpers of easydev.console (or of a local ``console``
            module). If none can be imported, or if color is False, plain
            text is printed instead.

        Each command of :attr:`commands` is run in each directory of
        :attr:`packages`. When a command fails and :attr:`force` is not
        set, the program exits. The working directory is set back to
        :attr:`curdir` at the end.
        """
        if color:
            try:
                from easydev.console import bold, red, green, \
                    color_terminal, nocolor, underline, purple
            except Exception:
                try:
                    sys.path.insert(0, os.path.join('deploy', 'src', 'deploy'))
                    from console import bold, red, green, \
                        color_terminal, nocolor, underline, purple
                except Exception:
                    # No console helpers available: switch to plain text
                    # instead of failing later on unbound names
                    color = False

        if color:
            if not color_terminal():
                # Windows' poor cmd box doesn't understand ANSI sequences
                nocolor()
        else:
            # str() returns the text unchanged
            bold = purple = red = green = underline = str

        print(bold("Running multisetup version %s" % __revision__.split()[2]))

        #project_dir = self.curdir.basename()
        directories = [package for package in self.packages]

        print('Will process the following directories: ', )
        for directory in directories:
            print(bold(directory))
            #print bold(directory.basename()),
        print('')

        try:
            for directory in directories:
                try:
                    os.chdir(directory)
                    print(
                        underline('Entering %s package' %
                                  os.path.basename(directory)))
                    #          % directory.basename())
                except OSError as err:
                    # NOTE(review): the commands below are still executed,
                    # from the current directory -- intended?
                    print(
                        underline('Entering %s package' %
                                  os.path.basename(directory)))
                    print(
                        red("cannot find this directory (%s)" %
                            os.path.basename(directory)))
                    print(err)

                print('Python exec : ', sys.executable)

                #print underline('Entering %s package' % directory.basename())
                for cmd in self.commands:
                    setup_command = '%s setup.py %s ' % (sys.executable, cmd)
                    print("\tExecuting " + setup_command + '...processing', )

                    #Run setup.py with user commands
                    outputs = None
                    errors = None
                    if self.verbose:
                        process = Popen(setup_command, shell=True)
                        process.wait()
                    else:
                        # universal_newlines=True: the captured streams are
                        # text (not bytes), as required by the parsing below
                        process = Popen(setup_command,
                                        stdout=PIPE,
                                        stderr=PIPE,
                                        shell=True,
                                        universal_newlines=True)
                        #status = process.wait()
                        outputs, errors = process.communicate()
                    if process.returncode == 0:
                        print(green('done'))
                    else:
                        if not self.verbose:
                            print(
                                red('\tFailed. ( error code %s) ' %
                                    (process.returncode)))
                            os.chdir(self.curdir)

                        if not self.force:
                            raise RuntimeError()

                    # Summary lines are extracted from the captured streams
                    # (available in non-verbose mode only)
                    if 'pylint' in cmd:
                        if outputs is not None:
                            for x in outputs.split('\n'):
                                if x.startswith('Your code has been'):
                                    print(purple('\t%s' % x))
                    if 'nosetests' in cmd:
                        if errors is not None:
                            for x in errors.split('\n'):
                                if x.startswith('TOTAL'):
                                    res = x.replace('TOTAL', 'Total coverage')
                                    res = " ".join(res.split())
                                    print(purple('\t%s' % res))
                                if x.startswith('Ran'):
                                    print(purple('\t%s' % x))
                                if x.startswith('FAILED'):
                                    print(purple('\t%s' % x))
                        else:
                            print(purple('all right'))

                os.chdir(self.curdir)

        except RuntimeError:
            # raised above when a command failed and force is not set
            sys.exit()

        os.chdir(self.curdir)
Example #23
0
from sequana import FastQ
from sequana import logger

import colorlog
# Module-level logger named after this module; this rebinds the ``logger``
# name imported from sequana just above. (Fixes the ``colorlob`` typo, which
# raised NameError at import time.)
logger = colorlog.getLogger(__name__)


class CustomFormatter(
        argparse.ArgumentDefaultsHelpFormatter,
        argparse.RawDescriptionHelpFormatter,
):
    """Help formatter combining two stock argparse formatters.

    Inherits from both ``ArgumentDefaultsHelpFormatter`` and
    ``RawDescriptionHelpFormatter``; it adds no behaviour of its own.
    """


# Author / documentation / issue-tracker footer, passed through ``purple``.
# NOTE(review): presumably used as the argparse ``epilog`` of the parser
# defined below -- confirm against the Options class.
epilog = purple("""
----

AUTHORS: Thomas Cokelaer
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)


class Options(argparse.ArgumentParser, SequanaOptions):
    def __init__(self, prog="sequana_substractor"):
        usage = """%s reads (flag 256+4) saving the mapped reads in a file, and the unmapped in
another file\n""" % prog
        usage += """usage2: %s --input test.fastq --reference Phix174.fa\n""" % prog
        usage += """

        """
        super(Options, self).__init__(usage=usage,
                                      prog=prog,
Example #24
0
def sequana_init(options):
    """Initialise a pipeline working directory from parsed options.

    Steps, in order:

    1. If ``options.target_dir`` already exists, warn that files will be
       overridden and (unless ``options.force`` is True) ask for
       confirmation; answering "n"/"no" exits with status 0.
    2. Create the target directory and copy the pipeline snakefile into it
       as ``<pipeline>.rules``.
    3. Write a ``README`` (with the colour escape codes stripped).
    4. Copy the config file (``options.config`` if given, otherwise the one
       shipped with the module), plus the multiqc and cluster configs.
    5. Fill the config with the input/pipeline-specific options and the
       ``--config-params`` overrides, then save it.
    6. Write an executable ``runme.sh`` that calls snakemake.

    :param options: parsed command-line options. The attributes read here
        are: verbose, pipeline, target_dir, force, config, input_directory,
        pattern, extension, file1, file2, input_readtag, design, kraken,
        adapter_fwd, adapter_rev, adapters, reference, config_params,
        forceall, cluster, ignore_cluster_config, redirection and jobs.
    :raises IOError: if ``options.config`` is set but the file is missing.
    :raises ValueError: if an item of ``options.config_params`` does not
        contain between one and three ``:`` separators.
    """
    import sequana
    from sequana import SequanaConfig
    sa = Tools(verbose=options.verbose)

    # Check that the pipeline is well defined
    module = Module(options.pipeline)

    if os.path.exists(options.target_dir):
        txt = "Will override the following files if present: %s.rules " +\
              "config.yaml, runme.sh, ..."
        sa.blue(txt % options.pipeline)

        if options.force is True:
            choice = "y"
        else:
            choice = input(
                red("Do you want to proceed (to avoid this " +
                    " message, use --force)? [y]/n:"))
        # Accept "N", " n", "no", ... as a refusal, not only a bare "n".
        if choice.strip().lower() in ("n", "no"):
            sys.exit(0)

    # Copying snakefile
    logger.info("Copying snakefile")
    sa.mkdir(options.target_dir)
    shutil.copy(module.snakefile,
                options.target_dir + os.sep + options.pipeline + ".rules")

    # Creating README to print on the screen and in a file
    txt = "User command::\n\n"
    txt += "    %s \n\n" % " ".join(sys.argv)
    txt += "You can now run snakemake yourself or type::"
    txt += purple("""

    snakemake -s %s.rules --stats stats.txt -p -j 4

    """ % options.pipeline)
    txt += """
    # -j 4 means you will use 4 cores
    # -p prints the commands used
    # --stats stats.txt must be used since stats.txt is expected to be found.

    or just run the bash script::

        sh runme.sh

    EDIT THE config.yaml if needed

    Once finished with success, the report/ directory contains a summary.html
    and relevant files (depends on the pipeline).
    """
    logger.info("Creating README")
    with open(options.target_dir + os.sep + "README", "w") as fh:
        # Strip the two escape sequences so the file is plain text
        fh.write(txt.replace("\x1b[35m", "").replace("\x1b[39;49;00m", ""))

    # Creating Config file
    logger.info("Creating the config file")

    # Create (if needed) and update the config file
    config_filename = options.target_dir + os.sep + "config.yaml"

    if options.config:
        # full existing path
        if os.path.exists(options.config):
            shutil.copy(options.config, config_filename)
        else:
            raise IOError("Config file %s not found locally" %
                          options.config)
    else:
        copy_config_from_sequana(module, "config.yaml", config_filename)

    # Copy multiqc if it is available
    multiqc_filename = options.target_dir + os.sep + "multiqc_config.yaml"
    copy_config_from_sequana(module, "multiqc_config.yaml", multiqc_filename)
    cluster_cfg_filename = options.target_dir + os.sep + "cluster_config.json"
    copy_config_from_sequana(module, "cluster_config.json",
                             cluster_cfg_filename)

    # The input
    cfg = SequanaConfig(config_filename)
    cfg.config.input_directory = options.input_directory
    cfg.config.input_pattern = options.pattern
    cfg.config.input_extension = options.extension
    cfg.config.input_samples.file1 = options.file1
    cfg.config.input_samples.file2 = options.file2
    cfg.config.input_readtag = options.input_readtag

    # Dedicated section for quality control section
    if options.pipeline == "quality_control":
        if options.design:
            shutil.copy(options.design, options.target_dir + os.sep)
            cfg.config['cutadapt'].design_file = os.path.basename(
                options.design)

        if options.kraken:
            cfg.config.kraken.database_directory = os.path.abspath(
                options.kraken)
            cfg.config.kraken.do = True
        else:
            cfg.config.kraken.do = False

        cfg.config['cutadapt'].fwd = options.adapter_fwd
        cfg.config['cutadapt'].rev = options.adapter_rev
        # NOTE(review): this branch sets ``adapter_type`` whereas the
        # rnaseq/smallrnaseq branch below sets ``adapter_choice`` -- confirm
        # which key each pipeline's config actually expects.
        cfg.config['cutadapt'].adapter_type = options.adapters
        # For all pipelines using BWA
        if options.reference:
            cfg.config.bwa_mem.reference = os.path.abspath(options.reference)
    if options.pipeline == "variant_calling":
        if options.reference:
            cfg.config.bwa_mem_ref.reference = os.path.abspath(
                options.reference)

    if options.pipeline in ["rnaseq", "smallrnaseq"]:
        if options.design:
            shutil.copy(options.design, options.target_dir + os.sep)
            cfg.config['cutadapt'].design_file = os.path.basename(
                options.design)
        cfg.config['cutadapt'].fwd = options.adapter_fwd
        cfg.config['cutadapt'].rev = options.adapter_rev
        cfg.config['cutadapt'].adapter_choice = options.adapters

    cfg.copy_requirements(target=options.target_dir)

    # Each item is key:value, section:key:value or
    # section:subsection:key:value.
    # FIXME If invalid, no error raised
    if options.config_params:
        params = [this.strip() for this in options.config_params.split(",")]
        for param in params:
            fields = param.split(":")
            keys, value = fields[:-1], fields[-1]
            if len(keys) not in (1, 2, 3):
                txt = "incorrect format following --config-params. "
                txt += "Expected at least one : sign and at most 3 of them. "
                txt += "Config file section such as :\n"
                txt += "project: tutorial\n"
                txt += "should be encoded project:tutorial"
                raise ValueError(txt)
            # Walk down to the innermost section, then set the last key
            section = cfg.config
            for key in keys[:-1]:
                section = section[key]
            section[keys[-1]] = value

    # important to update yaml with content of config
    cfg._update_yaml()
    cfg.save(config_filename)

    # Creating a unique runme.sh file.
    # The script text is assembled piece by piece instead of running a final
    # ``cmd % {...}`` over the whole string: sys.argv and options.cluster are
    # user-provided and may contain literal '%' characters, which would make
    # that final formatting fail or corrupt the script.
    runme_filename = options.target_dir + os.sep + "runme.sh"
    cmd = "#!/bin/sh\n"
    cmd += "# generated with sequana version %s with this command:\n" % sequana.version
    cmd += "# %s\n" % " ".join(sys.argv)
    cmd += "snakemake -s %s.rules --stats stats.txt -p -j %s --nolock" % (
        options.pipeline, options.jobs)
    if options.forceall:
        cmd += " --forceall "

    if options.cluster:
        # Do we want to include the cluster config option ?
        cluster_config = Module(options.pipeline).cluster_config
        if options.ignore_cluster_config is True:
            cluster_config = None

        if cluster_config is None:
            cmd += ' --cluster "%s"' % options.cluster
        else:
            cmd += ' --cluster "%s"  --cluster-config %s' %\
                (options.cluster, os.path.basename(cluster_config))

    if options.redirection:
        cmd += " 1>run.out 2>run.err"

    # Open the file only once the content is complete, so a failure above
    # does not leave an empty runme.sh behind.
    with open(runme_filename, "w") as fout:
        fout.write(cmd)

    # change permission of runme.sh to 755
    os.chmod(runme_filename, 0o755)

    sa.green("Initialisation of %s succeeded" % options.target_dir)
    sa.green("Please, go to the project directory ")
    sa.purple("\n   cd %s\n" % options.target_dir)
    sa.green("Check out the README and config.yaml files")
    sa.green("A basic script to run the analysis is named runme.sh ")
    sa.purple("\n    sh runme.sh\n")
    sa.purple("On a slurm cluster, you may type:")
    sa.purple("\n  srun --qos normal runme.sh\n")
    sa.green(
        "In case of trouble, please post an issue on https://github.com/sequana/sequana/issues "
    )
    sa.green(
        "or type sequana --issue and fill a post with the error and the config file (NO DATA PLEASE)"
    )
Example #25
0
    def run(self, color=True):
        """Execute ``python setup.py <command>`` on all packages.

        For every directory in ``self.packages`` the method changes into
        that directory and runs each entry of ``self.commands`` through the
        current interpreter (``sys.executable``). The working directory is
        restored to ``self.curdir`` after each package.

        In verbose mode (``self.verbose``) the sub-process output goes
        straight to the terminal. Otherwise it is captured, and only a short
        summary is printed for the ``pylint`` and ``nosetests`` commands.

        :param color: if True, try to use coloured output (falls back to
            plain text when the colour helpers cannot be imported); if
            False, plain text is used.

        If a command returns a non-zero code and ``self.force`` is not set,
        the process exits with status 1.
        """
        if color:
            try:
                from easydev.console import bold, red, green, \
                    color_terminal, nocolor, underline, purple
            except ImportError:
                try:
                    sys.path.insert(0, os.path.join('deploy', 'src', 'deploy'))
                    from console import bold, red, green, \
                        color_terminal, nocolor, underline, purple
                except ImportError:
                    # No colour helper available: degrade to plain text
                    # instead of failing later on unbound names.
                    color = False
            if color and not color_terminal():
                # Windows' poor cmd box doesn't understand ANSI sequences
                nocolor()
        if not color:
            bold = purple = red = green = underline = str

        print(bold("Running multisetup version %s" % __revision__.split()[2]))

        directories = list(self.packages)

        print('Will process the following directories: ' +
              ' '.join(bold(directory) for directory in directories))

        try:
            for directory in directories:
                package_name = os.path.basename(directory)
                print(underline('Entering %s package' % package_name))
                try:
                    os.chdir(directory)
                except OSError as err:
                    print(red("cannot find this directory (%s)"
                              % package_name))
                    print(err)
                    # Do not run setup.py from whatever directory we
                    # happen to be in: skip this package.
                    continue

                print('Python exec :  %s' % sys.executable)

                try:
                    for cmd in self.commands:
                        setup_command = '%s setup.py %s ' % (sys.executable,
                                                             cmd)
                        print("\tExecuting " + setup_command + '...processing')

                        # Run setup.py with user commands
                        outputs = None
                        errors = None
                        if self.verbose:
                            process = Popen(setup_command, shell=True)
                            process.wait()
                        else:
                            # universal_newlines=True makes communicate()
                            # return text, so the split('\n') calls below
                            # also work on Python 3.
                            process = Popen(setup_command,
                                            stdout=PIPE,
                                            stderr=PIPE,
                                            shell=True,
                                            universal_newlines=True)
                            outputs, errors = process.communicate()

                        if process.returncode == 0:
                            print(green('done'))
                        else:
                            if not self.verbose:
                                print(red('\tFailed. ( error code %s) ' %
                                          (process.returncode)))
                            if not self.force:
                                raise RuntimeError()

                        if 'pylint' in cmd:
                            if outputs is not None:
                                for x in outputs.split('\n'):
                                    if x.startswith('Your code has been'):
                                        print(purple('\t%s' % x))
                        if 'nosetests' in cmd:
                            if errors is not None:
                                for x in errors.split('\n'):
                                    if x.startswith('TOTAL'):
                                        res = x.replace('TOTAL',
                                                        'Total coverage')
                                        res = " ".join(res.split())
                                        print(purple('\t%s' % res))
                                    if x.startswith('Ran'):
                                        print(purple('\t%s' % x))
                                    if x.startswith('FAILED'):
                                        print(purple('\t%s' % x))
                            else:
                                print(purple('all right'))
                finally:
                    # Restore the starting directory once per package,
                    # whether its commands succeeded or not. Restoring it
                    # in the middle of the command loop would make the
                    # remaining commands run in the wrong place.
                    os.chdir(self.curdir)

        except RuntimeError:
            # A command failed and --force was not requested: report the
            # failure to the caller through a non-zero exit status.
            sys.exit(1)

        os.chdir(self.curdir)