def run_kraken_taxon():
    def download():
        kd = KrakenDownload()
        kd.download('toydb')

    database = sequana_config_path + os.sep + "kraken_toydb"
    if os.path.exists(database) is False:
        download()

    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz", "data")

    kt = KrakenAnalysis([file1, file2], database=database)
    kt.run()

    kt = KrakenAnalysis(file2, database=database)
    kt.run()

    p = tempfile.TemporaryDirectory()
    kt = KrakenHierarchical([file1, file2], [database, database],
                            output_directory=p.name, force=True)
    kt.run()
    kt = KrakenHierarchical(file1, [database, database],
                            output_directory=p.name, force=True)
    kt.run()
    p.cleanup()
def test_snpeff():
    # a custom reference
    fh_log = TempFile()
    mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"),
                           log=fh_log.name)
    with TempFile() as fh:
        mydata.launch_snpeff(sequana_data("JB409847.vcf"), fh.name)
    fh_log.delete()

    # cleanup
    try:
        os.remove("snpEff.config")
    except:
        pass
    try:
        os.remove("snpEff_genes.txt")
    except:
        pass
    try:
        os.remove("snpEff_summary.html")
    except:
        pass

    try:
        snpeff.SnpEff(reference="dummy")
        assert False
    except SystemExit:
        assert True
    except:
        assert False
def test_pacbio_input_bam(tmpdir):
    # we need a summary and a bunch of images
    filename = sequana_data("summary_pacbio_qc1.json")

    # mock the PNG files found in the summary
    import json
    summary = json.load(open(filename))
    pngname = sequana_data("no_data.jpg")
    summary["images"]["gc_vs_length"] = pngname
    summary["images"]["hist_gc_content"] = pngname
    summary["images"]["hist_read_length"] = pngname
    summary["images"]["hist_snr"] = pngname
    summary["images"]["hist_zmw"] = pngname

    summary_file = TempFile()
    with open(summary_file.name, "w") as ff:
        json.dump(summary, ff)

    # Now that we have this new summary file, let us use it.
    # We also need an output handler.
    ff = TempFile()
    from sequana.utils import config
    config.output_dir = "/tmp"
    # here, ff.name is of the form /tmp/djhfjh4dz so we need to remove the /tmp
    pacbio_input_bam.PacbioInputBAMModule(summary_file.name,
                                          ff.name.split("/")[1])

    # cleanup
    summary_file.delete()
    ff.delete()
def __init__(self, wk=None):
    super(VariantCallingPipeline, self).__init__(wk=wk)

    # Define the data
    data = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    input_directory = os.path.dirname(data)
    self.input_pattern = input_directory + "/Hm*gz"
    self.pipeline = "variant_calling"

    # Define the project and config file
    cmd = ["sequana", "--pipeline", self.pipeline,
           "--input-pattern", '%s' % self.input_pattern,
           "--working-directory", self.wk, "--force"]
    if "TRAVIS_PYTHON_VERSION" in os.environ:
        cmd += ["--snakemake-jobs", "1"]
    subprocess.check_call(cmd)

    # Add reference in the config
    cfg = SequanaConfig(self.wk + "/config.yaml")
    # We added a TTTT in position 5881
    cfg._yaml_code['bwa_mem_ref']['reference'] = sequana_data("measles.fa")
    cfg.save(self.wk + '/config.yaml')
def test_sequana_data():
    try:
        sequana_data()
        assert False
    except ValueError:
        assert True
def test_add_locus_with_modification():
    # Alter the original GBK to alter the locus name
    data = open(sequana_data("JB409847.gbk"), "r").read()
    newdata = data.replace("JB409847", "DUMMY_JB409847")

    fh = TempFile(suffix="gbk")
    with open(fh.name, 'w') as fout:
        fout.write(newdata)

    # Now we read this new GBK file that has a different locus name as
    # compared to the fasta
    mydata = snpeff.SnpEff(reference=fh.name)

    # Here is the corresponding FASTA
    fasta = sequana_data("JB409847.fasta")

    with TempFile(suffix="fasta") as fh2:
        mydata.add_locus_in_fasta(fasta, fh2.name)

        # In theory, in the newly created fasta file, we should find back the
        # DUMMY tag

        # cleanup
        try:
            os.remove("snpEff.config")
        except:
            pass

        data = open(fh2.name, "r").read()
        assert "DUMMY" in data
    fh.delete()
def test_is_sam_bam():
    datatest = sequana_data("test_measles.sam", "testing")
    assert is_sam(datatest) is True

    datatest = sequana_data("test_measles.bam", "testing")
    assert is_bam(datatest) is True
def test_input():
    filename = sequana_data('virus.bed', 'data')
    df = summary.main([prog, '--file', filename])
    len(df)

    filename = sequana_data('test.fastq', "testing")
    df = summary.main([prog, '--file', filename])
def test_ChromosomeCovMultiChunk():
    filename = sequana_data('JB409847.bed')
    # using a chunksize of 7000, we test an odd number of chunks
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'),
                             chunksize=7000)
    chrom = bed.chr_list[0]
    res = chrom.run(501, k=2, circular=True)
    res.get_summary()
    res.get_rois()
def test_pcrfree():
    design = sequana_data("test_index_mapper.csv")
    try:
        ad = FindAdaptersFromDesign(design, "error")
        assert False
    except Exception:
        assert True

    # Other input from PCRFree
    ad = FindAdaptersFromDesign(design, "PCRFree")

    # Test the index1/2_seq with 3 cases:
    # - index1 present only
    # - no index at all (None)
    # - index1 and index2 present
    design1 = sequana_data("test_expdesign_hiseq.csv")
    ad1 = FindAdaptersFromDesign(design1, "PCRFree")
    ad1.check()
    res1 = ad1.get_adapters_from_sample("553-iH2-1")
    res2 = ad1.get_adapters_from_sample("539-st2")
    res3 = ad1.get_adapters_from_sample("107-st2")
    assert res1['index1']['fwd'].identifier == \
        "NextFlex_PCR_Free_adapter8|name:8|seq:TTAGGC"
    assert res1['index1']['fwd'].name == "8"
    assert res1['index1']['rev'].name == "8"
    assert list(res2.keys()) == ["universal"]
    assert res3['index1']['fwd'].name == "9"
    assert res3['index1']['rev'].name == "9"
    assert res3['index2']['fwd'].name == "10"
    assert res3['index2']['rev'].name == "10"

    # double indexing
    # This is a double indexing for PCRFree, which has not been tested
    # since it requires 16S adapters not yet in sequana
    """design2 = sequana_data("test_expdesign_miseq_illumina2.csv")
    ad2 = FindAdaptersFromDesign(design2, "PCRFree")
    assert ad2.get_adapters_from_sample('M2')['index1']['fwd'].identifier == \
        'NextFlex_PCR_Free_adapter2|name:2|seq:TGACCA'
    assert ad2.get_adapters_from_sample('M2')['index2']['fwd'].identifier == \
        'NextFlex_PCR_Free_adapter13|name:13|seq:AGTCAA'
    """

    design = sequana_data("test_expdesign_miseq_illumina.csv")
    ad = FindAdaptersFromDesign(design, "PCRFree")
    res = ad.get_adapters_from_sample("CR81-L1236-P1")
    assert res['index1']['fwd'].identifier == \
        'NextFlex_PCR_Free_adapter1|name:1|seq:CGATGT'

    design1 = sequana_data("test_expdesign_miseq_illumina_1.csv")
    ad = FindAdaptersFromDesign(design1, "PCRFree")
    ad.check()  # all sample names must be found
    res = ad.get_adapters_from_sample("CM-2685")['index1']['fwd']
    assert res.name == "3"
def test_atropos_paired(tmpdir):
    # This is for the new version of cutadapt with version 1.1
    directory = tmpdir.mkdir('test_module')
    config.output_dir = str(directory)
    config.sample_name = 'JB409847'

    c = CutadaptModule(sequana_data('test_atropos_pe.txt'), "TEST", "test.html")
    assert c.jinja['mode'] == 'Paired-end'

    c = CutadaptModule(sequana_data('test_atropos_se.txt'), "TEST", "test.html")
    assert c.jinja['mode'] == 'Singled-end'
def test_sequana_data_star():
    # all files in a specific directory (a list)
    f1 = sequana_data("*", "images")
    assert isinstance(f1, list)
    assert 'Institut_Pasteur.png' in f1

    # all files (return a dict)
    f1 = sequana_data("*")
    assert isinstance(f1, dict)
def test_krona_merger():
    k1 = KronaMerger(sequana_data("test_krona_k1.tsv"))
    k2 = KronaMerger(sequana_data("test_krona_k2.tsv"))
    k1 += k2

    with TempFile(suffix='.tsv') as fh:
        df = k1.to_tsv(fh.name)
    assert all(df['count'] == [14043, 591, 184, 132])
    assert k1['Bacteria\tProteobacteria\tspecies1\n'] == 14043
def test_bcf_filter():
    vcf_output_expected = sequana_data('JB409847.filter.vcf')
    bcf = BCF_freebayes(sequana_data('JB409847.bcf'))
    filter_dict = {'freebayes_score': 200, 'frequency': 0.85, 'min_depth': 10,
                   'forward_depth': 3, 'reverse_depth': 3, 'strand_ratio': 0.3}
    filter_bcf = bcf.filter_bcf(filter_dict)
    with TempFile(suffix='.vcf') as fp:
        filter_bcf.to_vcf(fp.name)
        compare_file = filecmp.cmp(fp.name, vcf_output_expected)
        assert compare_file
def test_add_locus_no_modification():
    mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"))
    with TempFile() as fh:
        fastafile = sequana_data("JB409847.fasta")
        mydata.add_locus_in_fasta(fastafile, fh.name)
        # cleanup
        try:
            os.remove("snpEff.config")
        except:
            pass
def test_sequana_data():
    try:
        sequana_data()
        assert False
    except ValueError:
        assert True
    except:
        assert False

    sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz", "data")
def test_coverage_module(tmpdir):
    bed = bedtools.GenomeCov(sequana_data("JB409847.bed"))
    fasta = sequana_data("JB409847.fasta")
    bed.compute_gc_content(fasta)
    c = bed.chr_list[0]
    c.run(4001)

    directory = tmpdir.mkdir('test_coverage_module')
    config.output_dir = str(directory)
    config.sample_name = "JB409847"
    CoverageModule(bed)
def test_adapter_reader():
    from sequana.adapters import AdapterReader as AR
    data = sequana_data("adapters_with_duplicates.fa", "testing")
    try:
        ar = AR(data)
        ar.sanity_check()
    except ValueError:
        pass

    data1 = sequana_data("adapters_Nextera_fwd.fa")
    data2 = sequana_data("adapters_Nextera_rev.fa")
    data3 = sequana_data("adapters_Nextera_revcomp.fa")

    # try different constructors
    ar1 = AR(data1)
    ar1.__repr__()
    ar1.index_sequences

    ar_same = AR(ar1._data)  # from a list of dictionaries
    assert ar1 == ar_same
    ar_same = AR(ar1)  # from an AR instance
    assert ar1 == ar_same

    assert ar1[0]['identifier'] == 'Universal_Adapter|name:universal'
    ar1.index_names
    assert ar1.get_adapter_by_sequence("XXX") is None
    try:
        ar1.get_adapter_by_identifier("XXX")
        assert False
    except ValueError:
        assert True

    # __eq__
    assert len(ar1) == 56

    # accessors
    ar1.sequences, ar1.identifiers, ar1.comments
    ar1.get_adapter_by_sequence("ACGT")
    assert ar1.get_adapter_by_index_name("dummy") is None
    assert ar1.get_adapter_by_identifier("Nextera_index_N517")

    ar2 = AR(data2)
    ar2.reverse()
    # fails due to S516 ????????
    assert ar1 == ar2

    ar3 = AR(data3)
    ar3.reverse_complement()
    assert ar1 == ar3

    # test to_fasta method
    with TempFile() as fh:
        ar1.to_fasta(fh.name)
def test_canvasjs_linegraph():
    bed = bedtools.GenomeCov(sequana_data("JB409847.bed"))
    fasta = sequana_data("JB409847.fasta")
    bed.compute_gc_content(fasta)
    c = bed.chr_list[0]
    c.run(4001)
    df = bed[0].df
    csv = df.to_csv(columns=['pos', 'cov', 'gc'], index=False,
                    float_format='%.3g')

    # create CanvasJS stuff
    cjs = CanvasJSLineGraph(csv, 'cov', 'pos', ['cov', 'gc'])

    # set options
    cjs.set_options({'zoomEnabled': 'true',
                     'zoomType': 'x',
                     'exportEnabled': 'true'})
    # set title
    cjs.set_title("Genome Coverage")
    # set legend
    cjs.set_legend({'verticalAlign': 'bottom',
                    'horizontalAlign': 'center',
                    'cursor': 'pointer'},
                   hide_on_click=True)
    # set axis
    cjs.set_axis_x({'title': "Position (bp)",
                    'labelAngle': 30,
                    'minimum': 0,
                    'maximum': len(df)})
    cjs.set_axis_y({'title': "Coverage (Count)"})
    cjs.set_axis_y2({'title': "GC content (ratio)",
                     'minimum': 0,
                     'maximum': 1,
                     'lineColor': '#FFC425',
                     'titleFontColor': '#FFC425',
                     'labelFontColor': '#FFC425'})
    # set data
    cjs.set_data(index=0, data_dict={'type': 'line',
                                     'name': "Coverage",
                                     'showInLegend': 'true',
                                     'color': '#5BC0DE',
                                     'lineColor': '#5BC0DE'})
    cjs.set_data(index=1, data_dict={'type': 'line',
                                     'axisYType': 'secondary',
                                     'name': "GC content",
                                     'showInLegend': 'true',
                                     'color': '#FFC425',
                                     'lineColor': '#FFC425'})
    # create canvasJS
    cjs.create_canvasjs()
def test_analysis():
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz")
    reference = sequana_data("measles.fa")

    from tempfile import TemporaryDirectory
    directory = TemporaryDirectory()
    shutil.copy(file1, directory.name)
    shutil.copy(file2, directory.name)
    shutil.copy(reference, directory.name)

    df = mapping.main([prog, '--file1', file1, "--file2", file2,
                       "--reference", reference])
def test_design_constructor():
    filename = sequana_data("test_expdesign_miseq_illumina.csv")
    tt = ExpDesignMiSeq(filename)

    # using existing design
    tt = ExpDesignAdapter(tt)
    # or a filename
    tt = ExpDesignAdapter(filename)

    # constructor for hiseq
    tt = ExpDesignAdapter(sequana_data("test_expdesign_hiseq.csv"))
    tt = ExpDesignAdapter(sequana_data("test_expdesign_generic.csv"))
    tt
    print(tt)
def save_config_file(self, filename):
    from sequana import sequana_data
    config_generic = sequana_data("config.ini", "busco")
    data = open(config_generic, "r").read()
    data = data.format(**self.params)
    with open(filename, "w") as fh:
        fh.write(data)
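# A minimal, self-contained sketch of the template mechanism used by
# save_config_file above: a config template with {placeholders} is filled with
# str.format(**params) and written to disk. The template text, the parameter
# names ("sample", "lineage") and the output filename are illustrative
# assumptions, not the real content of sequana's busco config.ini.
def _demo_save_config(filename, params):
    template = "[busco]\nout = {sample}\nlineage_path = {lineage}\n"
    with open(filename, "w") as fh:
        fh.write(template.format(**params))

# usage sketch
# _demo_save_config("demo_busco.ini", {"sample": "S1", "lineage": "bacteria_odb9"})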
def test_duplicated_design():
    filename = sequana_data("test_expdesign_hiseq_duplicated_index.csv")
    ss = FindAdaptersFromDesign(filename, "Small")
    res = ss.get_adapters_from_sample("VB-22")
    assert res['index1']['fwd'].identifier == \
        "Small_Adapter_5|name:small5|seq:ACAGTG"
    assert res['index1']['fwd'].sequence == \
        "CAAGCAGAAGACGGCATACGAGATACAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA"
def test_pbsim():
    filename = sequana_data("test_pacbio_subreads.bam")
    ss = PBSim(filename, filename)
    with TempFile() as fh:
        ss.run(bins=100, step=50, output_filename=fh.name)
    from pylab import close
    close()
def test_to_csv():
    filter_dict = {'freebayes_score': 200, 'frequency': 0.85, 'min_depth': 10,
                   'forward_depth': 3, 'reverse_depth': 3, 'strand_ratio': 0.3}
    bcf = BCF_freebayes(sequana_data('JB409847.bcf'))
    filter_bcf = bcf.filter_bcf(filter_dict)
    with TempFile(suffix='.csv') as ft:
        filter_bcf.to_csv(ft.name)
def test_file_name_factory():
    import glob

    def inner_test(ff):
        len(ff)
        print(ff)
        ff.filenames
        ff.realpaths
        ff.all_extensions
        ff.pathnames
        ff.extensions

    # list
    list_files = glob.glob("*.py")
    ff = snaketools.FileFactory(list_files)
    inner_test(ff)

    # glob
    ff = snaketools.FileFactory("*py")
    inner_test(ff)

    directory = os.path.dirname(sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz"))
    ff = snaketools.FastQFactory(directory + "/Hm2*fastq.gz", verbose=True)
    assert ff.tags == ['Hm2_GTGAAA_L005']
    ff.get_file1(ff.tags[0])
    ff.get_file2(ff.tags[0])
    assert len(ff) == 1
def test_cram():
    datatest = sequana_data("test_measles.cram", "testing")
    s = CRAM(datatest)
    assert s.summary == {
        'flags': {77: 6, 83: 14, 99: 10, 141: 6, 147: 10, 163: 14},
        'mapq': {0: 12, 60: 48},
        'mean_quality': 33.666171617161723,
        'read_length': {79: 2, 81: 1, 93: 1, 101: 44}}
def test_sam(tmpdir):
    datatest = sequana_data("test.sam", "testing")
    s = SAM(datatest)
    assert len(s) == 432
    assert s.is_sorted is True
    assert s.is_paired is True
    df = s.get_df_concordance(max_align=100)
def __init__(self, wk=None):
    super(RNASeqPipeline, self).__init__(wk)
    data = sequana_data("KO_ATCACG_R1_test.fastq.gz")
    input_directory = os.path.dirname(data)
    self.input_pattern = input_directory + "/KO_ATCACG_R1_test.fastq.gz"
    self.pipeline = "rnaseq"
    #self.output = self.wk + "/Hm2_GTGAAA_L005/report_qc_Hm2_GTGAAA_L005/summary.json"

    subprocess.check_call([
        "sequana", "--pipeline", self.pipeline,
        "--input-pattern", '%s' % self.input_pattern,
        "--working-directory", self.wk,
        "--adapter-fwd", "GATCGGAAGAGCACACGTCTGAACTCCAGTCA",
        "--adapter-rev", "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC",
        "--force"])

    # Need to edit the config file
    cfg = SequanaConfig(self.wk + "/config.yaml")
    cfg._yaml_code['genome']['genome_directory'] = "Saccer3"
    cfg._yaml_code['genome']['name'] = "Saccer3"
    cfg._yaml_code['genome']['fasta_file'] = "Saccer3/Saccer3.fa"
    cfg._yaml_code['genome']['gff_file'] = "Saccer3/Saccer3.gff"  # key name assumed for the GFF annotation
    cfg.save(self.wk + '/config.yaml')
def test_vcf_filter_dp4():
    data = sequana_data("test_vcf_mpileup_4dot1.vcf")
    v = VCF(data)
    variant = next(v.vcf)

    def validate_variant_alternate(variant):
        # variant.ALT must be different from "." for this test
        assert str(variant.ALT[0]).strip() != "."

        # minimum depth of the alternate must be >= 4
        variant.INFO['DP4'] = [0, 0, 2, 2]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on the alternate reverse or forward strand
        variant.INFO['DP4'] = [0, 0, 4, 1]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [0, 0, 1, 4]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [25, 0, 75, 75]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True
        variant.INFO['DP4'] = [25, 25, 75, 74]  # just below 0.75 for the alt reverse
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [25, 25, 74, 75]  # just below 0.75 for the alt forward
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # variant.ALT is equal to "A"
    validate_variant_alternate(variant)

    def validate_variant_reference(variant):
        # variant.ALT must be equal to "." for this test
        assert str(variant.ALT[0]).strip() == "."

        # minimum depth of the reference must be >= 4
        variant.INFO['DP4'] = [2, 2, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on the reference reverse or forward strand
        variant.INFO['DP4'] = [4, 1, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [1, 4, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [75, 75, 25, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True
        variant.INFO['DP4'] = [75, 74, 25, 25]  # just below 0.75 for the ref reverse
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False
        variant.INFO['DP4'] = [74, 75, 25, 25]  # just below 0.75 for the ref forward
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # force variant.ALT to "." so that the reference branch is tested
    variant.ALT[0].sequence = "."
    validate_variant_reference(variant)

    # Now, let us do the filtering with the vcf_filter method
    v = VCF(data)
    v.vcf.apply_dp4_filter = True
    with TempFile() as fh:
        res = v.vcf.filter_vcf(fh.name)
    assert res == {'N': 573, 'filtered': 414, 'unfiltered': 159}
def test_pacbio_stride():
    b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    with TempFile() as fh:
        b.stride(fh.name, stride=2)
    with TempFile() as fh:
        b.stride(fh.name, stride=2, random=True)
def test_mkr():
    from sequana.kraken import MultiKrakenResults
    mkr = MultiKrakenResults([sequana_data("test_kraken_multiple_1.csv"),
                              sequana_data('test_kraken_multiple_1.csv')])
    mkr.plot_stacked_hist(kind="bar")
    mkr.plot_stacked_hist(kind="barh")
def test_input():
    filename = sequana_data('test_gtf_fixer.gtf')
    with TempFile() as fout:
        gtf = GTFFixer(filename)
        gtf.fix(fout.name)
""" read length histograms pacbio data ===================================== QC pacbio example """ ######################################## # First, let us get a data set example. # Note the .bam extension from sequana import sequana_data dataset = sequana_data("test_pacbio_subreads.bam") ############################################# # Create a :class:`sequana.pacbio.BAMPacbio` instance from sequana.pacbio import BAMPacbio qc = BAMPacbio(dataset) ######################################### # plot the histogram of read length qc.hist_len() ################################################# # plot the histogram of the SNRs for each base qc.hist_snr()
def test_pipeline_manager():
    # test missing input_directory
    cfg = SequanaConfig({})
    try:
        pm = snaketools.PipelineManager("custom", cfg)
        assert False
    except:
        assert True

    # normal behaviour but no input provided:
    config = Module("compressor")._get_config()
    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    try:
        pm = snaketools.PipelineManager("custome", cfg)
        assert False
    except:
        assert True

    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    cfg.config.input_directory, cfg.config.input_pattern = os.path.split(file1)
    #file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz")
    pm = snaketools.PipelineManager("custom", cfg)
    assert pm.paired == False

    cfg = SequanaConfig(config)
    cfg.cleanup()  # remove templates
    cfg.config.input_directory, cfg.config.input_pattern = os.path.split(file1)
    cfg.config.input_pattern = "Hm*gz"
    #file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    pm = snaketools.PipelineManager("custom", cfg)
    pm.plot_stats()
    assert pm.paired == True

    pm.getlogdir("fastqc")
    pm.getwkdir("fastqc")
    pm.getrawdata()
    pm.getreportdir("test")
    pm.getname("fastqc")

    # Test different configurations of input_directory, input_readtag,
    # input_pattern

    # Test the _R[12]_ paired
    with tempfile.TemporaryDirectory() as tmpdir:
        cfg = SequanaConfig()
        cfgname = tmpdir + "/config.yaml"
        cfg.config.input_pattern = "*fastq.gz"
        cfg.config.input_directory = tmpdir
        cfg.config.input_readtag = "_R[12]_"
        cfg._update_yaml()
        cfg.save(cfgname)
        cmd = "touch {}/test_R1_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        cmd = "touch {}/test_R2_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired == True

    # Test the _[12]. paired
    with tempfile.TemporaryDirectory() as tmpdir:
        cfg = SequanaConfig()
        cfgname = tmpdir + "/config.yaml"
        cfg.config.input_pattern = "*fastq.gz"
        cfg.config.input_directory = tmpdir
        cfg.config.input_readtag = "_[12]."
        cfg._update_yaml()
        cfg.save(cfgname)
        cmd = "touch {}/test_1.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        cmd = "touch {}/test_2.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired is True

    # Test the _R[12]_ single end
    with tempfile.TemporaryDirectory() as tmpdir:
        cfg = SequanaConfig()
        cfgname = tmpdir + "/config.yaml"
        cfg.config.input_pattern = "*fastq.gz"
        cfg.config.input_directory = tmpdir
        cfg.config.input_readtag = "_R[12]_"
        cfg._update_yaml()
        cfg.save(cfgname)
        cmd = "touch {}/test_R1_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        pm = snaketools.PipelineManager("test", cfgname)
        assert pm.paired is False

    # Test a non-matching input pattern (expected to raise an error)
    with tempfile.TemporaryDirectory() as tmpdir:
        cfg = SequanaConfig()
        cfgname = tmpdir + "/config.yaml"
        cfg.config.input_pattern = "*fq.gz"  # wrong on purpose
        cfg.config.input_directory = tmpdir
        cfg.config.input_readtag = "_R[12]_"
        cfg._update_yaml()
        cfg.save(cfgname)
        cmd = "touch {}/test_R1_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        try:
            pm = snaketools.PipelineManager("test", cfgname)
            assert False
        except:
            assert True

    # Test the R[12]_ readtag (expected to raise an error)
    with tempfile.TemporaryDirectory() as tmpdir:
        cfg = SequanaConfig()
        cfgname = tmpdir + "/config.yaml"
        cfg.config.input_pattern = "*fastq.gz"
        cfg.config.input_directory = tmpdir
        cfg.config.input_readtag = "R[12]_"
        cfg._update_yaml()
        cfg.save(cfgname)
        cmd = "touch {}/testR1_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        cmd = "touch {}/testR2_.fastq.gz".format(tmpdir)
        subprocess.call(cmd.split())
        try:
            pm = snaketools.PipelineManager("test", cfgname)
            assert False
        except:
            assert True
def test_mRNA_inner_distance():
    b = BAM(sequana_data("test_hg38_chr18.bam"))
    df = b.mRNA_inner_distance(sequana_data("hg38_chr18.bed"))
    # Total read pairs used 382
    # mean insert size: 88.3975155279503
    assert df[0]['val'].mean() > 1436 and df[0]['val'].mean() < 1437
def test_designMiSeq3():
    filename = sequana_data("test_expdesign_miseq_illumina_semicommas.csv")
    tt = ExpDesignMiSeq(filename)
    tt.df.Index1_ID[0] == 1
    assert tt.adapter_type == "NEXTFlex-PCRfree"
def test_designHiSeq():
    tt = ExpDesignHiSeq(sequana_data("test_expdesign_hiseq.csv", "testing"))
    assert list(tt.df['Index1_Seq'].values) == ['TTAGGC', None, 'ACTTGA']
def test_is_cram():
    datatest = sequana_data("test_measles.cram", "testing")
    assert is_cram(datatest) is True
""" BAM module example ==================== Plot histogram of MAPQ values contained in a BAM file """ ################################################# # first import the relevant modules from sequana import BAM, sequana_data ##################################################### # Get a data set (BAM file) for testing from sequana import BAM, sequana_data datatest = sequana_data('test.bam', "testing") ########################################################################## # Use :class:`sequana.bamtools.BAM` class to plot the MAPQ historgram b = BAM(datatest) b.plot_bar_mapq()
def test_mkr2():
    from sequana.kraken import MultiKrakenResults2
    mkr = MultiKrakenResults2([sequana_data("test_kraken_mkr2_summary_1.json"),
                               sequana_data('test_kraken_mkr2_summary_2.json')])
    mkr.plot_stacked_hist()
def test_spades():
    filename = sequana_data('test_contigs_spades.fasta')
    c = contigs.ContigsSpades(filename)
    c.hist_contig_length()
    c.plot_contig_length_vs_GC()
    c.scatter_length_cov_gc()
def test_bamreport(tmpdir):
    datatest = sequana_data("test.bam", "testing")
    directory = tmpdir.mkdir("bam")
    from sequana.utils import config
    config.output_dir = directory.__str__()
    r = BAMQCModule(datatest, "bam.html")
def test_designMiSeq2():
    filename = sequana_data("test_expdesign_miseq_illumina2.csv")
    tt = ExpDesignMiSeq(filename)
    tt.df.Index2_Seq[0] == "ACGTCTCG"
# is either C (classified) or U (unclassified) and the third column contains
# the most relevant taxon.
#
# The taxon identifiers are not readable, so we first need to get the
# scientific names. Besides, the lineage would be useful. This is done in
# Sequana using the :class:`sequana.kraken.KrakenResults`. See the following
# example.

##############################################
# Example
# --------
#
# In the following example, we use the results of a kraken analysis. The
# original toy data file contains 1500 reads, mostly related to Measles virus.
from sequana import KrakenResults, sequana_data

test_file = sequana_data("test_kraken.out", "testing")
k = KrakenResults(test_file)
df = k.plot(kind='pie')
print(df)

####################################################
# Note that only a subset of taxons is shown in the pie chart, that is
# those that cover at least 1% of the total reads. The others are grouped
# together and labelled "others".
#
# A more interactive plot can be obtained using Krona if installed:
from sequana import KrakenResults, sequana_data
test_file = sequana_data("test_kraken.out", "testing")

import easydev
if easydev.cmd_exists("ktImportText"):
    k = KrakenResults(test_file)
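    # A minimal sketch of how the interactive Krona page could then be built.
    # Assumptions (not shown in the original snippet): KrakenResults exposes a
    # kraken_to_krona method writing a Krona-compatible text file, and the
    # ktImportText executable accepts the -o output option; the file names
    # below are illustrative.
    k.kraken_to_krona(output_filename="krona.out")
    import subprocess
    subprocess.call(["ktImportText", "krona.out", "-o", "krona.html"])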
def _get_snpeff_config(self):
    """Copy the snpEff.config file into the current directory."""
    from sequana import sequana_data
    CONFIG = sequana_data("snpEff.config", "snpeff")
    shutil.copyfile(CONFIG, "./snpEff.config")
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold:
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m

    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    pylab.hist(self._df.loc[:, 'snr_A'].clip_upper(maxSNR), alpha=alpha,
               label="A", bins=bins)
    pylab.hist(self._df.loc[:, 'snr_C'].clip_upper(maxSNR), alpha=alpha,
               label="C", bins=bins)
    pylab.hist(self._df.loc[:, 'snr_G'].clip_upper(maxSNR), alpha=alpha,
               label="G", bins=bins)
    pylab.hist(self._df.loc[:, 'snr_T'].clip_upper(maxSNR), alpha=alpha,
               label="T", bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def test_genomecov():
    filename = sequana_data('JB409847.bed')

    # wrong file
    try:
        bed = bedtools.GenomeCov("dummy.csv")
        assert False
    except:
        assert True

    # wrong threshold
    try:
        bed = bedtools.GenomeCov(filename, high_threshold=2)
        assert False
    except:
        assert True

    # wrong threshold
    try:
        bed = bedtools.GenomeCov(filename, low_threshold=-2)
        assert False
    except:
        assert True

    # wrong genbank
    try:
        bed = bedtools.GenomeCov(filename, "dummy.gbk")
        assert False
    except:
        assert True

    # now let us read the good data set by chunks
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'),
                             chunksize=5000)
    for c in bed.chr_list:
        c.run(1001, k=2)

    # setter must be bool
    try:
        bed.circular = 1
        assert False
    except:
        assert True

    # can't use setter
    try:
        bed.feature_dict = {}
        assert False
    except:
        assert True

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4000
    bed.window_size = 4001
    bed.hist()

    # This requires to call other methods before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)
        chrom.compute_zscore()
        roi = chrom.get_rois()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)
        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.DOC
        chrom.CV

    with TempFile(suffix='.csv') as fh:
        bed.gc_window_size = 100
        bed.to_csv(fh.name)

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)
def test_distance():
    data = sequana_data("test.bam", "testing")
    distances = bam_get_paired_distance(data)
def test_sequana_data():
    f1 = sequana_data("Institut_Pasteur.png")
    f2 = sequana_data("Institut_Pasteur.png", "images")
    assert f1 == f2
def test_vcf_filter_freebayes():
    data = sequana_data("test.vcf")
    v = VCF(data)
    v.hist_qual()
def test_bam2fastq():
    data = sequana_data("test.bam", "testing")
    res = bam_to_mapped_unmapped_fastq(data)
def test_mgi():
    m = MGI(sequana_data("test_mgi.fqStat.txt"))
    m.plot_acgt()
    m.boxplot_quality()
def test_gc_content():
    from sequana.tools import gc_content
    data = sequana_data('measles.fa', "testing")
    gc_content(data, 10)['chr1']
    gc_content(data, 101, circular=True)['chr1']
def main(args=None):
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))
        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exist" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" %
                 (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold
    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and the output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(
            len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For the user, we start at position 1 but in python we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"],
                    gc.positions[this]["end"])
            logger.info("    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                        % (i + 1, len(gc), gc.chrom_names[i]))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = 'multiqc . -m sequana_coverage -f -c {}'.format(pathtocfg)
        import subprocess
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
def test_variant_calling_module(tmpdir):
    directory = tmpdir.mkdir('test_variant_calling_module')
    config.output_dir = str(directory)
    config.sample_name = 'JB409847'
    VariantCallingModule(sequana_data('JB409847.vc.csv'))
def test_cutadapt_options():
    p = argparse.ArgumentParser()
    so = CutadaptOptions()
    so.add_options(p)

    # test the adapter choice
    for this in ["universal", "PCRFree", "none"]:
        options = {
            "cutadapt_adapter_choice": this,
            "cutadapt_design_file": None,
            "cutadapt_fwd": None,
            "cutadapt_rev": None,
            "skip_cutadapt": False,
        }
        #p.parse_args([])
        options = AttrDict(**options)
        so.check_options(options)

    # test for a valid design and adapter choice
    options = {
        "cutadapt_adapter_choice": "TruSeq",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    so.check_options(options)

    # test for a valid design but wrong adapter choice
    options = {
        "cutadapt_adapter_choice": "Nextera",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    try:
        so.check_options(options)
        assert False
    except:
        assert True

    # wrong combo (missing adapter choice)
    options = {
        "cutadapt_adapter_choice": None,
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": None,
        "cutadapt_rev": None,
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    try:
        so.check_options(options)
        assert False
    except:
        assert True

    # wrong quality (negative value)
    try:
        p.parse_args(["--cutadapt-quality", "-1"])
        assert False
    except:
        assert True
    p.parse_args(["--cutadapt-quality", "10"])

    # test for a valid design and adapter choice but also fwd/rev provided,
    # whereas we cannot do anything with this combo
    options = {
        "cutadapt_adapter_choice": "TruSeq",
        "cutadapt_design_file": sequana_data("test_expdesign_Hm2.csv"),
        "cutadapt_fwd": "ACGT",  # dummy values
        "cutadapt_rev": "CGTA",  # dummy values
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    try:
        so.check_options(options)
        assert False
    except:
        assert True

    options = {
        "cutadapt_adapter_choice": None,
        "cutadapt_design_file": None,
        "cutadapt_fwd": sequana_data("TruSeqCD_DNA_fwd.fa"),
        "cutadapt_rev": sequana_data("TruSeqCD_DNA_rev.fa"),
        "skip_cutadapt": False,
    }
    options = AttrDict(**options)
    so.check_options(options)
def test_rnadiff_onefile():
    RNADIFF_DIR = sequana_data("rnadiff") + "/rnadiff_onecond_1"
    r = RNADiffResults(RNADIFF_DIR + "/tables/B3789-v1.surexpvsref.complete.xls")
    r.plot_count_per_sample()
    r.summary()
def _get_summary_section(self):
    df = self._get_stats()
    if len(df) == 1 and df.iloc[0]['taxon'] == -1:
        pngimage = sequana_data("no_data.jpg")
        extra = ("<p>No reads could be identified with the given "
                 "database(s).</p>")
    else:
        pngimage = self.directory + os.sep + "kraken.png"
        extra = """<p>The following <b>clickable image</b> is a simplified
version (only genus are shown) of an interactive and more detailed version
based on Krona. Finally, note that the unclassified species in the pie plot
may correspond to species not present in the database or to adapters (if not
removed).</p>"""

    html = """
<p>Overview of the taxonomic content of the filtered reads.</p>
<p>The taxonomic analysis is performed with Kraken (see the database name in
the configuration file). The analysis is performed with a k-mer approach.
The details about the database itself are available in the
<a href="http://sequana.readthedocs.io">Sequana documentation</a>.
The taxonomic analysis should give a good idea of the content of the FastQ
files but should be used as a sanity check. Indeed, species absent from the
database won't be detected, leading to false detection (close species may be
detected instead). Besides, be aware that closely related species may not be
classified precisely.</p>

{0}
<div style="text-align:center"><a href="./{1}/kraken.html"> {2} </a></div>
<br>
""".format(extra, self.directory.split(os.sep, 1)[1],
           self.png_to_embedded_png(pngimage))

    url_ncbi = "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}"
    df['links'] = [url_ncbi.format(taxon) for taxon in df['taxon']]
    datatable = DataTable(df, "kraken", index=False)

    # add links
    if "ena" in df.columns:
        urlena = "http://www.ebi.ac.uk/ena/data/view/"
        datatable.datatable.set_links_to_column(
            "ena", [urlena + this for this in df['ena']])
    datatable.datatable.set_links_to_column("links", "taxon")

    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        "paging": "false",
        "order": [[2, "desc"]],
        'buttons': ['copy', 'csv']
    }
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')
    html += "{} {}".format(html_tab, js)

    """# Rounding and convert in string to avoid exp notation
    df['percentage'] = df['percentage'].apply(lambda x: str(round(x,4)))
    #self.jinja['kraken_json'] = df.to_json()"""

    return html