Ejemplo n.º 1
0
def run_kraken_taxon():
    """Run the Kraken analyses (single, paired, hierarchical) on toy reads.

    The ``toydb`` database is downloaded first when the ``kraken_toydb``
    path is not found under ``sequana_config_path``.
    """
    database = sequana_config_path + os.sep + "kraken_toydb"
    if not os.path.exists(database):
        KrakenDownload().download('toydb')

    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz", "data")

    # Paired input first, then a single file, against one database.
    KrakenAnalysis([file1, file2], database=database).run()
    KrakenAnalysis(file2, database=database).run()

    # The context manager removes the scratch directory even when one of the
    # hierarchical runs raises (the explicit cleanup() call was skipped then).
    with tempfile.TemporaryDirectory() as outdir:
        KrakenHierarchical([file1, file2], [database, database],
                           output_directory=outdir, force=True).run()

        KrakenHierarchical(file1, [database, database],
                           output_directory=outdir, force=True).run()
Ejemplo n.º 2
0
def test_snpeff():
    """Run snpEff on a custom reference, then check a bad reference exits."""
    # a custom reference
    fh_log = TempFile()
    try:
        mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"),
                               log=fh_log.name)
        with TempFile() as fh:
            mydata.launch_snpeff(sequana_data("JB409847.vcf"), fh.name)
    finally:
        # Always drop the log file and the files left in the current
        # directory, even when snpEff fails.
        fh_log.delete()
        for leftover in ("snpEff.config", "snpEff_genes.txt",
                         "snpEff_summary.html"):
            try:
                os.remove(leftover)
            except OSError:
                # best effort: the file may never have been created
                pass

    # An unknown reference is expected to terminate with SystemExit; any
    # other exception propagates with its own traceback.
    try:
        snpeff.SnpEff(reference="dummy")
    except SystemExit:
        pass
    else:
        raise AssertionError("SnpEff(reference='dummy') should have exited")
Ejemplo n.º 3
0
def test_pacbio_input_bam(tmpdir):
    """Build the PacBio input-BAM report from a summary with mocked images."""
    import json
    import os

    # we need a summary and a bunch of images
    filename = sequana_data("summary_pacbio_qc1.json")
    with open(filename) as fin:
        summary = json.load(fin)

    # mock the image files referenced in the summary
    pngname = sequana_data("no_data.jpg")
    for image in ("gc_vs_length", "hist_gc_content", "hist_read_length",
                  "hist_snr", "hist_zmw"):
        summary["images"][image] = pngname

    summary_file = TempFile()
    # we also need an output handler
    report = TempFile()
    try:
        with open(summary_file.name, "w") as fout:
            json.dump(summary, fout)

        # Split the temporary path into directory and file name. The former
        # ``split("/")[1]`` returned "tmp" (the directory component) for a
        # path such as /tmp/abc, and "/tmp" was hard-coded as output dir.
        from sequana.utils import config
        config.output_dir = os.path.dirname(report.name)
        pacbio_input_bam.PacbioInputBAMModule(summary_file.name,
                                              os.path.basename(report.name))
    finally:
        # cleanup
        summary_file.delete()
        report.delete()
Ejemplo n.º 4
0
    def __init__(self, wk=None):
        """Initialise a variant_calling project in the working directory.

        :param wk: working directory, forwarded to the parent constructor.

        The ``sequana`` command line is invoked once to create the project;
        the resulting ``config.yaml`` is then edited to set the reference.
        """
        super(VariantCallingPipeline, self).__init__(wk=wk)
        # Define the data
        data = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
        input_directory = os.path.dirname(data)
        self.input_pattern = input_directory + "/Hm*gz"
        self.pipeline = "variant_calling"

        # Define the project and config file. The command used to be run
        # twice with the same arguments; a single --force run is enough.
        cmd = ["sequana", "--pipeline", self.pipeline,
               "--input-pattern", '%s' % self.input_pattern,
               "--working-directory", self.wk, "--force"]

        if "TRAVIS_PYTHON_VERSION" in os.environ:
            cmd += ["--snakemake-jobs", "1"]

        subprocess.check_call(cmd)

        # Add reference in the config
        config_file = self.wk + "/config.yaml"
        cfg = SequanaConfig(config_file)
        # We added a TTTT in position 5881
        cfg._yaml_code['bwa_mem_ref']['reference'] = sequana_data("measles.fa")
        cfg.save(config_file)
Ejemplo n.º 5
0
def test_sequana_data():
    """Calling ``sequana_data`` without arguments must raise ValueError."""
    try:
        sequana_data()
    except ValueError:
        pass
    else:
        assert False
Ejemplo n.º 6
0
def test_add_locus_with_modification():
    """Check add_locus_in_fasta when the GBK and FASTA locus names differ."""
    # Alter the original GBK to alter the locus name
    with open(sequana_data("JB409847.gbk"), "r") as fin:
        newdata = fin.read().replace("JB409847", "DUMMY_JB409847")

    fh = TempFile(suffix="gbk")
    try:
        with open(fh.name, 'w') as fout:
            fout.write(newdata)

        # Now we read this new GBK file that has a different locus name as
        # compared to the fasta
        mydata = snpeff.SnpEff(reference=fh.name)

        # Here is the corresponding FASTA
        fasta = sequana_data("JB409847.fasta")

        with TempFile(suffix="fasta") as fh2:
            mydata.add_locus_in_fasta(fasta, fh2.name)
            with open(fh2.name, "r") as fin:
                data = fin.read()

        # In theory, in the newly created fasta file, we should find back the
        # DUMMY tag
        assert "DUMMY" in data
    finally:
        # cleanup, performed even when one of the steps above fails
        fh.delete()
        try:
            os.remove("snpEff.config")
        except OSError:
            # best effort: the file may not have been created
            pass
Ejemplo n.º 7
0
def test_is_sam_bam():
    """The SAM and BAM test files must be recognised by their detectors."""
    checks = ((is_sam, "test_measles.sam"), (is_bam, "test_measles.bam"))
    for detector, basename in checks:
        path = sequana_data(basename, "testing")
        assert detector(path) is True
Ejemplo n.º 8
0
def test_input():
    """Run the summary entry point on a BED file and on a FastQ file."""
    bed_file = sequana_data('virus.bed', 'data')
    bed_summary = summary.main([prog, '--file', bed_file])
    # the BED summary must support len()
    len(bed_summary)

    fastq_file = sequana_data('test.fastq', "testing")
    summary.main([prog, '--file', fastq_file])
Ejemplo n.º 9
0
def test_ChromosomeCovMultiChunk():
    """Run the coverage analysis on the first chromosome, read by chunks."""
    # using chunksize of 7000, we test odd number
    coverage = bedtools.GenomeCov(sequana_data('JB409847.bed'),
                                  sequana_data('JB409847.gbk'),
                                  chunksize=7000)
    results = coverage.chr_list[0].run(501, k=2, circular=True)
    results.get_summary()
    results.get_rois()
Ejemplo n.º 10
0
def test_pcrfree():
    """Check adapter lookup from several experimental designs (PCRFree)."""
    design = sequana_data("test_index_mapper.csv")

    # An unknown adapter type must be rejected. The failure is raised in the
    # ``else`` branch: an ``assert False`` placed inside the ``try`` body was
    # itself caught by ``except Exception`` and could never fail the test.
    try:
        FindAdaptersFromDesign(design, "error")
    except Exception:
        pass
    else:
        raise AssertionError("adapter type 'error' should be rejected")

    # Other input from PCRFree
    ad = FindAdaptersFromDesign(design, "PCRFree")

    # Test the index1/2_seq with 3 cases
    # index1 present only,
    # no index at all (None)
    # index1 and index2 present
    design1 = sequana_data("test_expdesign_hiseq.csv")
    ad1 = FindAdaptersFromDesign(design1, "PCRFree")
    ad1.check()
    res1 = ad1.get_adapters_from_sample("553-iH2-1")
    res2 = ad1.get_adapters_from_sample("539-st2")
    res3 = ad1.get_adapters_from_sample("107-st2")

    assert res1['index1']['fwd'].identifier == "NextFlex_PCR_Free_adapter8|name:8|seq:TTAGGC"
    assert res1['index1']['fwd'].name == "8"
    assert res1['index1']['rev'].name == "8"

    assert list(res2.keys()) == ["universal"]

    assert res3['index1']['fwd'].name == "9"
    assert res3['index1']['rev'].name == "9"
    assert res3['index2']['fwd'].name == "10"
    assert res3['index2']['rev'].name == "10"

    # double indexing
    # This is a double indexing for PCRFree, which has not been tested
    # since it requires 16S adapters not yet in sequana
    #
    # design2 = sequana_data("test_expdesign_miseq_illumina2.csv")
    # ad2 = FindAdaptersFromDesign(design2, "PCRFree")
    # assert ad2.get_adapters_from_sample('M2')['index1']['fwd'].identifier == \
    #         'NextFlex_PCR_Free_adapter2|name:2|seq:TGACCA'
    # assert ad2.get_adapters_from_sample('M2')['index2']['fwd'].identifier == \
    #         'NextFlex_PCR_Free_adapter13|name:13|seq:AGTCAA'

    design = sequana_data("test_expdesign_miseq_illumina.csv")
    ad = FindAdaptersFromDesign(design, "PCRFree")
    res = ad.get_adapters_from_sample("CR81-L1236-P1")
    assert res['index1']['fwd'].identifier == 'NextFlex_PCR_Free_adapter1|name:1|seq:CGATGT'

    design1 = sequana_data("test_expdesign_miseq_illumina_1.csv")
    ad = FindAdaptersFromDesign(design1, "PCRFree")
    ad.check()  # all sample names must be found
    res = ad.get_adapters_from_sample("CM-2685")['index1']['fwd']
    assert res.name == "3"
Ejemplo n.º 11
0
def test_atropos_paired(tmpdir):
    """The Cutadapt report module must detect the mode of atropos logs."""
    # This is for the new version of cutadapt with version 1.1
    config.output_dir = str(tmpdir.mkdir('test_module'))
    config.sample_name = 'JB409847'

    expected_modes = (('test_atropos_pe.txt', 'Paired-end'),
                      ('test_atropos_se.txt', 'Singled-end'))
    for logfile, mode in expected_modes:
        module = CutadaptModule(sequana_data(logfile), "TEST", "test.html")
        assert module.jinja['mode'] == mode
Ejemplo n.º 12
0
def test_sequana_data_star():
    """The "*" wildcard returns a list for one directory, a dict otherwise."""
    # all files in a specific directory (a list)
    images = sequana_data("*", "images")
    assert isinstance(images, list)
    assert 'Institut_Pasteur.png' in images

    # all files (return a dict)
    everything = sequana_data("*")
    assert isinstance(everything, dict)
Ejemplo n.º 13
0
def test_krona_merger():
    """Merge two Krona tables in place and check the resulting counts."""
    merged = KronaMerger(sequana_data("test_krona_k1.tsv"))
    other = KronaMerger(sequana_data("test_krona_k2.tsv"))
    merged += other

    with TempFile(suffix='.tsv') as fh:
        table = merged.to_tsv(fh.name)

    expected_counts = [14043, 591, 184, 132]
    assert all(table['count'] == expected_counts)
    assert merged['Bacteria\tProteobacteria\tspecies1\n'] == 14043
Ejemplo n.º 14
0
def test_bcf_filter():
    """Filter a freebayes BCF and compare the VCF output to the reference."""
    expected_vcf = sequana_data('JB409847.filter.vcf')
    variants = BCF_freebayes(sequana_data('JB409847.bcf'))
    thresholds = dict(freebayes_score=200, frequency=0.85, min_depth=10,
                      forward_depth=3, reverse_depth=3, strand_ratio=0.3)
    filtered = variants.filter_bcf(thresholds)
    with TempFile(suffix='.vcf') as fp:
        filtered.to_vcf(fp.name)
        # the written file must be identical to the expected one
        assert filecmp.cmp(fp.name, expected_vcf)
Ejemplo n.º 15
0
def test_add_locus_no_modification():
    """Run add_locus_in_fasta when GBK and FASTA come from the same record."""
    mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"))
    try:
        with TempFile() as fh:
            fastafile = sequana_data("JB409847.fasta")
            mydata.add_locus_in_fasta(fastafile, fh.name)
    finally:
        # cleanup, also performed when the call above fails
        try:
            os.remove("snpEff.config")
        except OSError:
            # best effort: the file may not have been created
            pass
Ejemplo n.º 16
0
def test_sequana_data():
    """sequana_data needs a file name; a known file must be resolved."""
    # Only ValueError is accepted. Any other exception now propagates with
    # its own traceback instead of being replaced by a bare AssertionError.
    try:
        sequana_data()
    except ValueError:
        pass
    else:
        raise AssertionError("sequana_data() without argument should fail")

    sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
Ejemplo n.º 17
0
def test_coverage_module(tmpdir):
    """Build the coverage report module from an analysed BED file."""
    coverage = bedtools.GenomeCov(sequana_data("JB409847.bed"))
    coverage.compute_gc_content(sequana_data("JB409847.fasta"))
    coverage.chr_list[0].run(4001)

    # the report is written below the pytest temporary directory
    config.output_dir = str(tmpdir.mkdir('test_coverage_module'))
    config.sample_name = "JB409847"
    CoverageModule(coverage)
Ejemplo n.º 18
0
def test_adapter_reader():
    """Exercise AdapterReader constructors, accessors and comparisons."""
    from sequana.adapters import AdapterReader as AR

    # A ValueError while reading/checking the duplicated file is tolerated.
    duplicated = sequana_data("adapters_with_duplicates.fa", "testing")
    try:
        reader = AR(duplicated)
        reader.sanity_check()
    except ValueError:
        pass

    fwd_file = sequana_data("adapters_Nextera_fwd.fa")
    rev_file = sequana_data("adapters_Nextera_rev.fa")
    revcomp_file = sequana_data("adapters_Nextera_revcomp.fa")

    # try different constructors
    forward = AR(fwd_file)
    forward.__repr__()
    forward.index_sequences
    # from a list of dictionaries
    clone = AR(forward._data)
    assert forward == clone
    # from a AR instance
    clone = AR(forward)
    assert forward == clone
    assert forward[0]['identifier'] == 'Universal_Adapter|name:universal'
    forward.index_names
    assert forward.get_adapter_by_sequence("XXX") is None
    try:
        forward.get_adapter_by_identifier("XXX")
    except ValueError:
        pass
    else:
        assert False

    # __len__
    assert len(forward) == 56

    # accessors
    forward.sequences
    forward.identifiers
    forward.comments

    forward.get_adapter_by_sequence("ACGT")
    assert forward.get_adapter_by_index_name("dummy") is None
    assert forward.get_adapter_by_identifier("Nextera_index_N517")

    # reversing the "rev" file must give back the forward adapters
    reverse = AR(rev_file)
    reverse.reverse()
    # fails due to S516 ????????
    assert forward == reverse

    # same for the reverse complement file
    revcomp = AR(revcomp_file)
    revcomp.reverse_complement()
    assert forward == revcomp

    # test to_fasta method
    with TempFile() as fh:
        forward.to_fasta(fh.name)
Ejemplo n.º 19
0
def test_canvasjs_linegraph():
    """Build a CanvasJS line graph (coverage and GC) from a BED file."""
    coverage = bedtools.GenomeCov(sequana_data("JB409847.bed"))
    coverage.compute_gc_content(sequana_data("JB409847.fasta"))

    chromosome = coverage.chr_list[0]
    chromosome.run(4001)

    frame = coverage[0].df
    csv_text = frame.to_csv(columns=['pos', 'cov', 'gc'], index=False,
                            float_format='%.3g')

    # create CanvasJS stuff
    graph = CanvasJSLineGraph(csv_text, 'cov', 'pos', ['cov', 'gc'])

    # set options
    options = {'zoomEnabled': 'true',
               'zoomType': 'x',
               'exportEnabled': 'true'}
    graph.set_options(options)

    # set title
    graph.set_title("Genome Coverage")

    # set legend
    legend = {'verticalAlign': 'bottom',
              'horizontalAlign': 'center',
              'cursor':'pointer'}
    graph.set_legend(legend, hide_on_click=True)

    # set axis
    x_axis = {'title': "Position (bp)",
              'labelAngle': 30,
              'minimum': 0,
              'maximum': len(frame)}
    graph.set_axis_x(x_axis)
    graph.set_axis_y({'title': "Coverage (Count)"})
    gc_colour = '#FFC425'
    y2_axis = {'title': "GC content (ratio)",
               'minimum':0,
               'maximum': 1,
               'lineColor': gc_colour,
               'titleFontColor': gc_colour,
               'labelFontColor': gc_colour}
    graph.set_axis_y2(y2_axis)

    # set datas
    coverage_series = {'type': 'line',
                       'name': "Coverage",
                       'showInLegend': 'true',
                       'color': '#5BC0DE',
                       'lineColor': '#5BC0DE'}
    graph.set_data(index=0, data_dict=coverage_series)
    gc_series = {'type': 'line',
                 'axisYType': 'secondary',
                 'name': "GC content",
                 'showInLegend': 'true',
                 'color': gc_colour,
                 'lineColor': gc_colour}
    graph.set_data(index=1, data_dict=gc_series)

    # create canvasJS
    graph.create_canvasjs()
Ejemplo n.º 20
0
def test_analysis():
    """Run the mapping entry point on the paired toy reads."""
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz")
    reference = sequana_data("measles.fa")

    from tempfile import TemporaryDirectory
    # The context manager removes the scratch directory when the test ends,
    # whether it passes or fails; it was previously never cleaned up.
    with TemporaryDirectory() as directory:
        shutil.copy(file1, directory)
        shutil.copy(file2, directory)
        shutil.copy(reference, directory)

        mapping.main([prog,
                      '--file1', file1,
                      "--file2", file2,
                      "--reference", reference])
Ejemplo n.º 21
0
def test_design_constructor():
    """ExpDesignAdapter accepts a design instance or a design file name."""
    miseq_file = sequana_data("test_expdesign_miseq_illumina.csv")
    miseq = ExpDesignMiSeq(miseq_file)

    # using existing design
    ExpDesignAdapter(miseq)
    # or a filename
    ExpDesignAdapter(miseq_file)

    # constructor for hiseq
    ExpDesignAdapter(sequana_data("test_expdesign_hiseq.csv"))

    # generic design; printing exercises its string representation
    generic = ExpDesignAdapter(sequana_data("test_expdesign_generic.csv"))
    print(generic)
Ejemplo n.º 22
0
 def save_config_file(self, filename):
     """Write the busco ``config.ini`` template, filled in, to *filename*.

     The template shipped with sequana is formatted with ``self.params``
     (``str.format`` keyword substitution) before being written.

     :param str filename: path of the configuration file to create.
     """
     from sequana import sequana_data
     config_generic = sequana_data("config.ini", "busco")
     # Context managers close both handles deterministically. The template
     # is formatted before the output is opened so that a formatting error
     # does not leave a truncated file behind.
     with open(config_generic, "r") as fin:
         data = fin.read()
     data = data.format(**self.params)
     with open(filename, "w") as fh:
         fh.write(data)
Ejemplo n.º 23
0
def test_duplicated_design():
    """Adapters are still found in a design holding a duplicated index."""
    design = sequana_data("test_expdesign_hiseq_duplicated_index.csv")
    finder = FindAdaptersFromDesign(design, "Small")
    forward = finder.get_adapters_from_sample("VB-22")['index1']['fwd']
    assert forward.identifier == "Small_Adapter_5|name:small5|seq:ACAGTG"
    # a second lookup mirrors the two independent accesses made originally
    forward = finder.get_adapters_from_sample("VB-22")['index1']['fwd'] if False else forward
    assert forward.sequence == "CAAGCAGAAGACGGCATACGAGATACAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
Ejemplo n.º 24
0
def test_pbsim():
    """Run PBSim using the same BAM file for both inputs."""
    bam = sequana_data("test_pacbio_subreads.bam")
    simulator = PBSim(bam, bam)
    with TempFile() as fh:
        simulator.run(bins=100, step=50, output_filename=fh.name)
    # close the figure
    import pylab
    pylab.close()
Ejemplo n.º 25
0
def test_to_csv():
    """Filter a freebayes BCF and export the result as CSV."""
    thresholds = dict(freebayes_score=200, frequency=0.85, min_depth=10,
                      forward_depth=3, reverse_depth=3, strand_ratio=0.3)
    variants = BCF_freebayes(sequana_data('JB409847.bcf'))
    filtered = variants.filter_bcf(thresholds)
    with TempFile(suffix='.csv') as ft:
        filtered.to_csv(ft.name)
Ejemplo n.º 26
0
def test_file_name_factory():
    """Exercise FileFactory and FastQFactory with lists and glob patterns."""
    import glob

    def exercise(factory):
        # length, printing and every read-only accessor must work
        len(factory)
        print(factory)
        for accessor in ("filenames", "realpaths", "all_extensions",
                         "pathnames", "extensions"):
            getattr(factory, accessor)

    # list
    exercise(snaketools.FileFactory(glob.glob("*.py")))

    # glob
    exercise(snaketools.FileFactory("*py"))

    reads = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    directory = os.path.dirname(reads)

    fastq = snaketools.FastQFactory(directory + "/Hm2*fastq.gz", verbose=True)
    assert fastq.tags == ['Hm2_GTGAAA_L005']

    tag = fastq.tags[0]
    fastq.get_file1(tag)
    fastq.get_file2(fastq.tags[0])
    assert len(fastq) == 1
Ejemplo n.º 27
0
def test_cram():
    """The summary of the CRAM test file must match the recorded values."""
    cram = CRAM(sequana_data("test_measles.cram", "testing"))
    observed = cram.summary
    expected = {
        'flags': {77: 6, 83: 14, 99: 10, 141: 6, 147: 10, 163: 14},
        'mapq': {0: 12, 60: 48},
        'mean_quality': 33.666171617161723,
        'read_length': {79: 2, 81: 1, 93: 1, 101: 44},
    }
    assert observed == expected
Ejemplo n.º 28
0
def test_sam(tmpdir):
    """Check length, sorting and pairing of the SAM test file."""
    sam = SAM(sequana_data("test.sam", "testing"))
    assert len(sam) == 432
    for flag in (sam.is_sorted, sam.is_paired):
        assert flag is True
    # the concordance table must be computable
    sam.get_df_concordance(max_align=100)
Ejemplo n.º 29
0
    def __init__(self, wk=None):
        """Initialise an rnaseq project in the working directory.

        :param wk: working directory, forwarded to the parent constructor.

        The ``sequana`` command line creates the project; the genome section
        of the resulting ``config.yaml`` is then filled in.
        """
        super(RNASeqPipeline, self).__init__(wk)
        data = sequana_data("KO_ATCACG_R1_test.fastq.gz")
        input_directory = os.path.dirname(data)
        self.input_pattern = input_directory + "/KO_ATCACG_R1_test.fastq.gz"
        self.pipeline = "rnaseq"

        subprocess.check_call([
            "sequana", "--pipeline", self.pipeline,
            "--input-pattern", '%s' % self.input_pattern,
            "--working-directory", self.wk,
            "--adapter-fwd",
            "GATCGGAAGAGCACACGTCTGAACTCCAGTCA",
            "--adapter-rev",
            "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC",
            "--force"])

        # Need to edit the config file
        config_file = self.wk + "/config.yaml"
        cfg = SequanaConfig(config_file)
        genome = cfg._yaml_code['genome']
        genome['genome_directory'] = "Saccer3"
        genome['name'] = "Saccer3"
        genome['fasta_file'] = "Saccer3/Saccer3.fa"
        # The GFF path used to be stored under 'fasta_file' as well, which
        # overwrote the FASTA entry set on the previous line.
        # NOTE(review): confirm 'gff_file' is the key used by the rnaseq config.
        genome['gff_file'] = "Saccer3/Saccer3.gff"
        cfg.save(config_file)
Ejemplo n.º 30
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the axis labels and title
        :param bool grid: if True, a grid is drawn
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str title: title of the figure
        :param clip_upper_SNR: SNR values above this threshold are clipped
            to it; it is also the largest possible upper edge of the bins.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        # Largest SNR over the four bases, capped by clip_upper_SNR.
        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:, "snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m
        maxSNR = min(maxSNR, clip_upper_SNR)

        edges = pylab.linspace(0, maxSNR, bins)

        # Series.clip(upper=...) replaces Series.clip_upper, which is no
        # longer available in recent pandas releases.
        for letter in "ACGT":
            values = self._df.loc[:, "snr_{}".format(letter)]
            pylab.hist(values.clip(upper=maxSNR), alpha=alpha, label=letter,
                       bins=edges)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 31
0
def test_vcf_filter_dp4():
    """Check ``is_valid_dp4`` on hand-made DP4 values, then the DP4 filter.

    The first variant of the test VCF is reused throughout: its ``DP4`` INFO
    field is overwritten before each call, so the statements are
    order-dependent.
    """

    data = sequana_data("test_vcf_mpileup_4dot1.vcf")
    v = VCF(data)
    variant = next(v.vcf)

    def validate_variant_alternatate(variant):
        """Checks for a variant whose ALT differs from "." (last two DP4 values)."""
        # variant.ALT must be different from "." for this test
        assert str(variant.ALT[0]).strip() != "."

        # NOTE(review): is_valid_dp4 is called with (variant, 4, 2, 0.75);
        # presumably minimum depth, minimum depth per strand and minimum
        # ratio -- confirm against the method signature.
        # test minimum depth of alternate must be >= 4
        variant.INFO['DP4'] = [0, 0, 2, 2]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on alternate strand reverse or forward
        variant.INFO['DP4'] = [0, 0, 4, 1]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [0, 0, 1, 4]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [25, 0, 75, 75]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True

        variant.INFO['DP4'] = [25, 25, 75,
                               74]  # just below 0.75 for the alt reverse
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [25, 25, 74,
                               75]  # just below 0.75 for the alt forward
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # here variant.ALT is expected to be a real allele ("A"), not "."
    validate_variant_alternatate(variant)

    def validate_variant_reference(variant):
        """Mirror of the checks above for ALT == "." (first two DP4 values)."""
        # variant.ALT must be equal to "." for this test
        assert str(variant.ALT[0]).strip() == "."

        # test minimum depth of reference must be >= 4
        variant.INFO['DP4'] = [2, 2, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on reference strand reverse or forward
        variant.INFO['DP4'] = [4, 1, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [1, 4, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [75, 75, 25, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True

        variant.INFO['DP4'] = [75, 74, 25,
                               25]  # just below 0.75 for the ref reverse
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [74, 75, 25,
                               25]  # just below 0.75 for the ref forward
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # overwrite the ALT sequence with "." so the reference checks apply
    variant.ALT[0].sequence = "."
    validate_variant_reference(variant)

    # Now, let us do the filtering with the filter_vcf method
    v = VCF(data)
    v.vcf.apply_dp4_filter = True
    with TempFile() as fh:
        res = v.vcf.filter_vcf(fh.name)
    assert res == {'N': 573, 'filtered': 414, 'unfiltered': 159}
Ejemplo n.º 32
0
def test_pacbio_stride():
    """Write strided subsets of a PacBio BAM, regular then random."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    for options in ({}, {"random": True}):
        with TempFile() as fh:
            subreads.stride(fh.name, stride=2, **options)
Ejemplo n.º 33
0
def test_mkr():
    """Plot stacked histograms of several Kraken results, both orientations."""
    from sequana.kraken import MultiKrakenResults
    inputs = [sequana_data("test_kraken_multiple_1.csv"),
              sequana_data('test_kraken_multiple_1.csv')]
    results = MultiKrakenResults(inputs)
    for kind in ("bar", "barh"):
        results.plot_stacked_hist(kind=kind)
Ejemplo n.º 34
0
def test_input():
    """Run GTFFixer on the test GTF file, writing to a temporary file."""
    gtf_file = sequana_data('test_gtf_fixer.gtf')

    with TempFile() as fout:
        fixer = GTFFixer(gtf_file)
        fixer.fix(fout.name)
Ejemplo n.º 35
0
"""
read length histograms pacbio data
=====================================

QC pacbio example

"""

########################################
# First, let us get an example data set.
# Note the .bam extension.
from sequana import sequana_data
dataset  = sequana_data("test_pacbio_subreads.bam")

#############################################
# Create a :class:`sequana.pacbio.BAMPacbio` instance
from sequana.pacbio import BAMPacbio
qc = BAMPacbio(dataset)

#########################################
# Plot the histogram of the read lengths
qc.hist_len()


#################################################
# Plot the histogram of the SNRs for each base
qc.hist_snr()
Ejemplo n.º 36
0
def _touch(directory, name):
    """Create an empty file called *name* inside *directory*."""
    # No external ``touch`` command: a whitespace-split command line breaks
    # as soon as the directory name contains a space.
    with open(os.path.join(directory, name), "a"):
        pass


def _save_config(directory, input_pattern, input_readtag):
    """Save a default config pointing at *directory*; return its file name."""
    cfg = SequanaConfig()
    cfgname = directory + "/config.yaml"
    cfg.config.input_pattern = input_pattern
    cfg.config.input_directory = directory
    cfg.config.input_readtag = input_readtag
    cfg._update_yaml()
    cfg.save(cfgname)
    return cfgname


def _assert_pipeline_manager_fails(name, cfg):
    """Fail unless building a PipelineManager from *cfg* raises."""
    # SystemExit is listed explicitly because the former bare ``except:``
    # accepted it too. The AssertionError is raised outside the ``try`` body
    # so that it cannot be swallowed by the handler.
    try:
        snaketools.PipelineManager(name, cfg)
    except (Exception, SystemExit):
        return
    raise AssertionError("PipelineManager(%r, ...) was expected to fail" % name)


def test_pipeline_manager():
    """Check PipelineManager paired/single detection and its error cases."""

    # test missing input_directory
    _assert_pipeline_manager_fails("custom", SequanaConfig({}))

    # normal behaviour but no input provided:
    config = Module("compressor")._get_config()
    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    _assert_pipeline_manager_fails("custome", cfg)

    # a single file: single-end
    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    cfg.config.input_directory, cfg.config.input_pattern = os.path.split(file1)
    pm = snaketools.PipelineManager("custom", cfg)
    assert pm.paired == False

    # a pattern matching both reads: paired-end
    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    cfg.config.input_directory, cfg.config.input_pattern = os.path.split(file1)
    cfg.config.input_pattern = "Hm*gz"
    pm = snaketools.PipelineManager("custom", cfg)
    pm.plot_stats()
    assert pm.paired == True

    pm.getlogdir("fastqc")
    pm.getwkdir("fastqc")
    pm.getrawdata()
    pm.getreportdir("test")
    pm.getname("fastqc")

    # Test different configuration of input_directory, input_readtag,
    # input_pattern
    # Test the _R[12]_ paired
    with tempfile.TemporaryDirectory() as tmpdir:
        cfgname = _save_config(tmpdir, "*fastq.gz", "_R[12]_")
        _touch(tmpdir, "test_R1_.fastq.gz")
        _touch(tmpdir, "test_R2_.fastq.gz")
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired == True

    # Test the _[12]. paired
    with tempfile.TemporaryDirectory() as tmpdir:
        cfgname = _save_config(tmpdir, "*fastq.gz", "_[12].")
        _touch(tmpdir, "test_1.fastq.gz")
        _touch(tmpdir, "test_2.fastq.gz")
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired is True

    # Test the _R[12]_ single end
    with tempfile.TemporaryDirectory() as tmpdir:
        cfgname = _save_config(tmpdir, "*fastq.gz", "_R[12]_")
        _touch(tmpdir, "test_R1_.fastq.gz")
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired is False

    # The input pattern does not match the file: must fail
    with tempfile.TemporaryDirectory() as tmpdir:
        cfgname = _save_config(tmpdir, "*fq.gz", "_R[12]_")  # wrong on purpose
        _touch(tmpdir, "test_R1_.fastq.gz")
        _assert_pipeline_manager_fails("test", cfgname)

    # Read tag "R[12]_" with files testR1_/testR2_: the original test
    # expected a failure here as well
    with tempfile.TemporaryDirectory() as tmpdir:
        cfgname = _save_config(tmpdir, "*fastq.gz", "R[12]_")
        _touch(tmpdir, "testR1_.fastq.gz")
        _touch(tmpdir, "testR2_.fastq.gz")
        _assert_pipeline_manager_fails("test", cfgname)
Ejemplo n.º 37
0
def test_mRNA_inner_distance():
    """The mean of the 'val' column of the first result must be in (1436, 1437)."""
    bam = BAM(sequana_data("test_hg38_chr18.bam"))
    distances = bam.mRNA_inner_distance(sequana_data("hg38_chr18.bed"))
    # Total read pairs  used 382
    # mean insert size: 88.3975155279503
    # NOTE(review): the figures above do not match the asserted range below.
    values = distances[0]['val']
    assert values.mean() > 1436 and values.mean() < 1437
Ejemplo n.º 38
0
def test_designMiSeq3():
    """Load the 'semicommas' MiSeq design file and check its adapter type."""
    design = ExpDesignMiSeq(
        sequana_data("test_expdesign_miseq_illumina_semicommas.csv"))
    # NOTE(review): the comparison result is discarded, so this only checks
    # that the first Index1_ID can be read; an ``assert`` was probably meant.
    design.df.Index1_ID[0] == 1
    assert design.adapter_type == "NEXTFlex-PCRfree"
Ejemplo n.º 39
0
def test_designHiSeq():
    """The HiSeq design must expose the expected Index1_Seq column."""
    design = ExpDesignHiSeq(sequana_data("test_expdesign_hiseq.csv", "testing"))
    observed = list(design.df['Index1_Seq'].values)
    assert observed == ['TTAGGC', None, 'ACTTGA']
Ejemplo n.º 40
0
def test_is_cram():
    """The CRAM test file must be recognised by ``is_cram``."""
    recognised = is_cram(sequana_data("test_measles.cram", "testing"))
    assert recognised is True
Ejemplo n.º 41
0
"""
BAM module example
====================

Plot histogram of MAPQ values contained in a BAM file

"""

#################################################
# First, import the relevant modules
from sequana import BAM, sequana_data

#####################################################
# Get a data set (BAM file) for testing
from sequana import BAM, sequana_data
datatest = sequana_data('test.bam', "testing")

##########################################################################
# Use the :class:`sequana.bamtools.BAM` class to plot the MAPQ histogram
b = BAM(datatest)
b.plot_bar_mapq()
Ejemplo n.º 42
0
def test_mkr2():
    """Plot the stacked histogram built from two Kraken JSON summaries."""
    from sequana.kraken import MultiKrakenResults2
    summaries = [sequana_data("test_kraken_mkr2_summary_1.json"),
                 sequana_data('test_kraken_mkr2_summary_2.json')]
    MultiKrakenResults2(summaries).plot_stacked_hist()
Ejemplo n.º 43
0
def test_spades():
    """Call every plotting method of ContigsSpades on the test contigs."""
    spades_contigs = contigs.ContigsSpades(
        sequana_data('test_contigs_spades.fasta'))
    spades_contigs.hist_contig_length()
    spades_contigs.plot_contig_length_vs_GC()
    spades_contigs.scatter_length_cov_gc()
Ejemplo n.º 44
0
def test_bamreport(tmpdir):
    """Build the BAM QC report module inside a pytest temporary directory."""
    from sequana.utils import config
    bam_file = sequana_data("test.bam", "testing")
    config.output_dir = str(tmpdir.mkdir("bam"))
    BAMQCModule(bam_file, "bam.html")
Ejemplo n.º 45
0
def test_designMiSeq2():
    """Load the second MiSeq design file and check its first Index2 sequence."""
    filename = sequana_data("test_expdesign_miseq_illumina2.csv")
    tt = ExpDesignMiSeq(filename)
    # This comparison used to be a bare expression, so it never verified
    # anything; it is now a real assertion.
    assert tt.df.Index2_Seq[0] == "ACGTCTCG"
Ejemplo n.º 46
0
# is either C (classified) or U (unclassified) and the third column contains
# the most relevant taxon.
#
# The taxons are not readable, so we first need to get the scientific names.
# Besides, the lineage would be useful. This is done in Sequana using
# the :class:`sequana.kraken.KrakenResults` class. See the following example.

##############################################
# Example
# --------
#
# In the following example, we use the results of a kraken analysis. The
# original toy data files contain 1500 reads mostly related to Measles virus
#
from sequana import KrakenResults, sequana_data
test_file = sequana_data("test_kraken.out", "testing")
k = KrakenResults(test_file)
df = k.plot(kind='pie')
print(df)

####################################################
# Note that only a subset of taxons is shown in the pie chart,
# that is, those that cover at least 1% of the total reads. Others
# are put together and labelled "others"
#
# A more interactive plot can be obtained using Krona if installed:
from sequana import KrakenResults, sequana_data
test_file = sequana_data("test_kraken.out", "testing")
import easydev
if easydev.cmd_exists("ktImportText"):
    k = KrakenResults(test_file)
Ejemplo n.º 47
0
 def _get_snpeff_config(self):
     """Copy the packaged snpEff.config file into the current directory.

     The source file is located with
     ``sequana_data("snpEff.config", "snpeff")`` and written to
     ``./snpEff.config``.
     """
     from sequana import sequana_data
     packaged_config = sequana_data("snpEff.config", "snpeff")
     destination = "./snpEff.config"
     shutil.copyfile(packaged_config, destination)
Ejemplo n.º 48
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title="",
                 clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        One histogram per nucleotide (columns ``snr_A``, ``snr_C``, ``snr_G``
        and ``snr_T`` of the internal dataframe) is drawn on the current
        figure. If the ``snr_A`` column contains no value, the "no_data.jpg"
        image is shown instead.

        :param int bins: number of bin edges for the histograms. The edges
            range from 0 to the largest SNR found, itself bounded by
            *clip_upper_SNR*
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the axis labels and of the title
        :param bool grid: if True, a grid is added
        :param str xlabel: label of the X axis
        :param str ylabel: label of the Y axis
        :param str title: title of the figure
        :param clip_upper_SNR: upper bound of the plotted range; larger SNR
            values are clipped to it

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        letters = "ACGT"

        # Largest SNR over the four nucleotides (never below 0), bounded by
        # the user-provided clipping value.
        maxSNR = 0
        for letter in letters:
            m = self._df.loc[:, "snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        edges = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper() no longer exists in recent pandas releases;
        # clip(upper=...) is the equivalent call available in old and new ones.
        for letter in letters:
            values = self._df.loc[:, "snr_{}".format(letter)]
            pylab.hist(values.clip(upper=maxSNR),
                       alpha=alpha,
                       label=letter,
                       bins=edges)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Ejemplo n.º 49
0
def test_genomecov():
    """Exercise GenomeCov: invalid inputs, setters, analysis and plotting.

    The negative checks used ``assert False`` inside a ``try`` guarded by a
    bare ``except:``, which also swallowed the AssertionError, so they could
    never fail. They now go through :func:`must_fail`, which raises outside
    of the guarded call.
    """
    def must_fail(action, description):
        # SystemExit is not an Exception subclass; it is listed explicitly so
        # that the same failures as with the original bare except are accepted.
        try:
            action()
        except (Exception, SystemExit):
            return
        raise AssertionError("{} should have failed".format(description))

    filename = sequana_data('JB409847.bed')

    # wrong file
    must_fail(lambda: bedtools.GenomeCov("dummy.csv"), "wrong input file")

    # wrong threshold
    must_fail(lambda: bedtools.GenomeCov(filename, high_threshold=2),
              "wrong high threshold")

    # wrong threshold
    must_fail(lambda: bedtools.GenomeCov(filename, low_threshold=-2),
              "wrong low threshold")

    # wrong genbank
    must_fail(lambda: bedtools.GenomeCov(filename, "dummy.gbk"),
              "wrong genbank file")

    # now let us read the good data sets by chunks
    bed = bedtools.GenomeCov(filename,
                             sequana_data('JB409847.gbk'),
                             chunksize=5000)
    for c in bed.chr_list:
        c.run(1001, k=2)

    # setter must be bool
    must_fail(lambda: setattr(bed, "circular", 1),
              "setting circular to a non-boolean")

    # cant use setter
    must_fail(lambda: setattr(bed, "feature_dict", {}),
              "setting feature_dict")

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4000
    bed.window_size = 4001
    bed.hist()

    # This requires to call other method before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)

        chrom.compute_zscore()
        roi = chrom.get_rois()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)

        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.DOC
        chrom.CV
    with TempFile(suffix='.csv') as fh:
        bed.gc_window_size = 100
        bed.to_csv(fh.name)

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False,
                                           logy=False,
                                           filename=fh.name)
Ejemplo n.º 50
0
def test_distance():
    """Run ``bam_get_paired_distance`` on the BAM test file."""
    bam_path = sequana_data("test.bam", "testing")
    paired_distances = bam_get_paired_distance(bam_path)
Ejemplo n.º 51
0
def test_sequana_data():
    """The same path is returned with or without the "images" argument."""
    without_where = sequana_data("Institut_Pasteur.png")
    with_where = sequana_data("Institut_Pasteur.png", "images")
    assert without_where == with_where
Ejemplo n.º 52
0
def test_vcf_filter_freebayes():
    """Open the VCF test file and draw the quality histogram."""
    vcf_path = sequana_data("test.vcf")
    variants = VCF(vcf_path)
    variants.hist_qual()
Ejemplo n.º 53
0
def test_bam2fastq():
    """Run ``bam_to_mapped_unmapped_fastq`` on the BAM test file."""
    bam_path = sequana_data("test.bam", "testing")
    outcome = bam_to_mapped_unmapped_fastq(bam_path)
Ejemplo n.º 54
0
def test_mgi():
    """Load an MGI fqStat file and call its two plotting methods."""
    stat_file = sequana_data("test_mgi.fqStat.txt")
    mgi = MGI(stat_file)
    mgi.plot_acgt()
    mgi.boxplot_quality()
Ejemplo n.º 55
0
def test_gc_content():
    """Compute the GC content of the measles FASTA, linear then circular."""
    from sequana.tools import gc_content
    fasta_path = sequana_data('measles.fa', "testing")
    linear = gc_content(fasta_path, 10)
    linear['chr1']
    circular = gc_content(fasta_path, 101, circular=True)
    circular['chr1']
Ejemplo n.º 56
0
def main(args=None):
    """Run the coverage analysis from command-line arguments.

    Depending on the parsed options, the function either downloads a
    reference and/or a genbank file and returns, or performs the analysis:
    a BAM input is converted into a BED file with ``bedtools genomecov``, a
    :class:`GenomeCov` instance is built, the GC content is computed when a
    reference is given, :func:`run_analysis` is called on the selected
    chromosome(s) and, unless ``options.skip_multiqc`` is set, ``multiqc``
    is launched in the output directory.

    :param list args: command-line arguments, the first item being the
        program name. Defaults to a copy of ``sys.argv``.
    :raises ValueError: if the input file name ends neither with ``.bam``
        nor with ``.bed``.

    The process exits with status 1 if the chromosome index is invalid.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        # NOTE(review): presumably exits after printing the help; otherwise
        # ``options`` would be undefined below -- confirm against Options.
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True,
                                   fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Only swap the extension: str.replace() would also rewrite any other
        # ".bam" occurring earlier in the path (e.g. in a directory name).
        bedfile = options.input[:-len(".bam")] + ".bed"
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" %
                 (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile,
                   options.genbank,
                   options.low_threshold,
                   options.high_threshold,
                   options.double_threshold,
                   options.double_threshold,
                   chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        # NOTE(review): index 0 passes this check although the message below
        # states that valid indices start at 1 -- confirm the intent.
        msg = "invalid chromosome index; must be in [1;{}]".format(
            len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"],
                    gc.positions[this]["end"])
            logger.info(
                "    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            # Log the name of the chromosome being analysed: indexing
            # gc.chrom_names with the loop counter reported the first
            # chromosome whenever a single one was selected by the user.
            logger.info(
                "==================== analysing chrom/contig %s/%s (%s)" %
                (i + 1, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Build the argument list directly instead of splitting a formatted
        # string, so a configuration path with whitespace stays one argument.
        cmd = ["multiqc", ".", "-m", "sequana_coverage", "-f", "-c", pathtocfg]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
def test_variant_calling_module(tmpdir):
    """Create a VariantCallingModule with output sent to a temp directory."""
    out_dir = tmpdir.mkdir('test_variant_calling_module')
    # The module is configured through the shared ``config`` object.
    config.output_dir = str(out_dir)
    config.sample_name = 'JB409847'
    csv_path = sequana_data('JB409847.vc.csv')
    VariantCallingModule(csv_path)
Ejemplo n.º 58
0
def test_cutadapt_options():
    """Check valid and invalid option combinations of CutadaptOptions.

    The negative checks used ``assert False`` inside a ``try`` guarded by a
    bare ``except:``, which also swallowed the AssertionError, so they could
    never fail. They now go through :func:`must_fail`, which raises outside
    of the guarded call.
    """
    def must_fail(action, description):
        # SystemExit is not an Exception subclass; it is listed explicitly so
        # that the same failures as with the original bare except are accepted
        # (argparse reports invalid arguments by raising SystemExit).
        try:
            action()
        except (Exception, SystemExit):
            return
        raise AssertionError("{} should have failed".format(description))

    p = argparse.ArgumentParser()
    so = CutadaptOptions()
    so.add_options(p)

    # test the adapter choice
    for this in ["universal", "PCRFree", "none"]:
        options = {
            "cutadapt_adapter_choice": this,
            "cutadapt_design_file": None,
            "cutadapt_fwd": None,
            "cutadapt_rev": None,
            "skip_cutadapt": False,
        }
        options = AttrDict(**options)
        so.check_options(options)

    # test for a valid design and adapter choice
    options = {
        "cutadapt_adapter_choice": "TruSeq",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    so.check_options(options)

    # test for a valid design but wrong adapter choice
    options = {
        "cutadapt_adapter_choice": "Nextera",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    wrong_choice = AttrDict(**options)
    must_fail(lambda: so.check_options(wrong_choice),
              "valid design with wrong adapter choice")

    # wrong combo (missing adapter choice)
    options = {
        "cutadapt_adapter_choice": None,
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    missing_choice = AttrDict(**options)
    must_fail(lambda: so.check_options(missing_choice),
              "design without adapter choice")

    # wrong quality (negative value)
    must_fail(lambda: p.parse_args(["--cutadapt-quality", "-1"]),
              "negative cutadapt quality")
    p.parse_args(["--cutadapt-quality", "10"])

    # test for a valid design and adapter choice but also fwd/rev provided
    # whereas, we cannot do anything with this combo
    options = {
        "cutadapt_adapter_choice": "TruSeq",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": "ACGT",  # dummy values
        "cutadapt_rev": "CGTA",  # dummy values
        "skip_cutadapt": False,
    }
    design_and_adapters = AttrDict(**options)
    must_fail(lambda: so.check_options(design_and_adapters),
              "design combined with fwd/rev adapters")

    options = {
        "cutadapt_adapter_choice": None,
        "cutadapt_design_file": None,
        "cutadapt_fwd": sequana_data("TruSeqCD_DNA_fwd.fa"),
        "cutadapt_rev": sequana_data("TruSeqCD_DNA_rev.fa"),
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    so.check_options(options)
Ejemplo n.º 59
0
def test_rnadiff_onefile():
    """Load one RNADiff result table, then plot and summarise it."""
    rnadiff_dir = sequana_data("rnadiff") + "/rnadiff_onecond_1"
    table_path = rnadiff_dir + "/tables/B3789-v1.surexpvsref.complete.xls"
    results = RNADiffResults(table_path)
    results.plot_count_per_sample()
    results.summary()
Ejemplo n.º 60
0
    def _get_summary_section(self):

        df = self._get_stats()
        if len(df) == 1 and df.iloc[0]['taxon'] == -1:
            pngimage = sequana_data("no_data.jpg")
            extra = "<p> no reads could be identified with the given the database(s)."
        else:
            pngimage = self.directory + os.sep + "kraken.png"
            extra = """<p>The following <b>clickable image</b> is a simplified 
version (only genus are shown) of an interactive and more detailled version 
based on Krona. Finally, note that the unclassified species in the pie plot 
may correspond to species not present in the data base or adapters (if not 
removed).</p>"""

        html = """
    <p>Overview of the Taxonomic content of the filtered reads. </p>
    <p>The taxonomic analysis is performed with Kraken (see database name in 
the configuration file. The analysis is performed with a Kmer
approach.
The details about the database itself are available in the <a
href="http://sequana.readthedocs.io">Sequana documentation</a>.
The taxonomic analysis should give a good idea of the content of the FastQ
files but should be used as a sanity check. Indeed, species absent
from the database won't be detected leading to false detection (close species 
may be detected instead). 
Besides, be aware that closely related species may not be classified precisely.
</p>

    {0}
    <div style="text-align:center"><a href="./{1}/kraken.html"> {2} </a></div>
    <br>
""".format(extra,
           self.directory.split(os.sep, 1)[1],
           self.png_to_embedded_png(pngimage))

        url_ncbi = "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}"
        df['links'] = [url_ncbi.format(taxon) for taxon in df['taxon']]
        datatable = DataTable(df, "kraken", index=False)
        # add links
        if "ena" in df.columns:
            urlena = "http://www.ebi.ac.uk/ena/data/view/"
            datatable.datatable.set_links_to_column(
                "ena", [urlena + this for this in df['ena']])

        datatable.datatable.set_links_to_column("links", "taxon")

        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 30,
            'scrollCollapse': 'true',
            'dom': 'Bfrtip',
            "paging": "false",
            "order": [[2, "desc"]],
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')

        html += "{} {}".format(html_tab, js)
        """# Rounding and convert in string to avoid exp notation
        df['percentage']  = df['percentage'].apply(lambda x: str(round(x,4)))
        #self.jinja['kraken_json'] = df.to_json()"""

        return html