def test_gridss_clove_pipeline(sorted_bam, reference_genome, outdir, threads=4, replace=False):
    """Smoke-test the gridss + clove run with the default gridss filters.

    Creates `outdir`, calls fun.run_gridssClove_given_filters on `sorted_bam`
    and `reference_genome`, and prints a confirmation message once the call
    returns. Nothing is returned; a failure surfaces as whatever exception the
    pipeline call raises.
    """

    fun.make_folder(outdir)

    # fixed settings of this test run
    pipeline_settings = dict(
        replace=replace,
        threads=threads,
        gridss_blacklisted_regions="",
        gridss_VCFoutput="",
        gridss_maxcoverage=50000,
        median_insert_size=250,
        median_insert_size_sd=25,
        gridss_filters_dict=fun.default_filtersDict_gridss,
        run_in_parallel=True,
        max_rel_coverage_to_consider_del=0.2,
        min_rel_coverage_to_consider_dup=1.8,
        replace_FromGridssRun=replace,
    )

    # -1 is the placeholder median coverage (to be recalculated)
    placeholder_coverage = -1

    # the two returned objects are not inspected, only unpacked
    _sv_dict, _gridss_df = fun.run_gridssClove_given_filters(
        sorted_bam, reference_genome, outdir, placeholder_coverage, **pipeline_settings)

    print("you could run the gridss + clove pipeline succesfully")
def test_SRAdb_query_downloading_and_readTrimming(outdir, reference_genome, target_taxID, replace=False, threads=4):
    """Check that querying SRA, downloading reads and trimming them works.

    Runs fun.get_close_shortReads_table_close_to_taxID for `target_taxID`
    with get_lowest_coverage_possible=True (taking the lowest coverage
    datasets), then checks that the resulting table has exactly the columns
    short_reads1, short_reads2, runID and sampleID and exactly 2 rows.
    This tests that sra tools, entrez tools, trimmomatic and fastqc work well.

    Example taxIDs: 5476 is C. albicans, 1335626 is MERS.

    This is a best-effort check: any ordinary failure (including the table
    validation) is reported as a printed warning instead of being raised, so
    that the rest of the test suite can continue. KeyboardInterrupt and
    SystemExit are NOT swallowed.
    """

    # make the outdir
    fun.make_folder(outdir)

    # set ploidy to 1
    ploidy = 1

    try:

        # run with 'get_lowest_coverage_possible=True', which will take the lowest coverage datasets
        close_shortReads_table = fun.get_close_shortReads_table_close_to_taxID(
            target_taxID, reference_genome, outdir, ploidy,
            n_close_samples=2,
            nruns_per_sample=1,
            replace=replace,
            threads=threads,
            min_fraction_reads_mapped=0.0,
            coverage_subset_reads=0.1,
            min_coverage=5,
            job_array_mode="local",
            StopAfter_sampleIndexingFromSRA=False,
            queue_jobs="debug",
            max_ncores_queue=768,
            time_read_obtention="02:00:00",
            StopAfterPrefecth_of_reads=False,
            get_lowest_coverage_possible=True)

        # check that the table has the expected columns and number of rows
        df_close_shortReads_table = fun.pd.read_csv(close_shortReads_table, sep="\t")
        expected_fields = {'short_reads2', 'short_reads1', 'runID', 'sampleID'}

        if set(df_close_shortReads_table.keys()) != expected_fields or len(df_close_shortReads_table) != 2:
            raise ValueError("The close_shortReads_table %s was not created as expected" % close_shortReads_table)

        print("The system to query the SRA database, dowload and trim reads works")

    # 'except Exception' (not a bare 'except') so that Ctrl-C / sys.exit still stop the run
    except Exception as err:

        print("\n\n---\nWARNING: The connection to SRA did not work. This means that the automated obtention of reads of close species for benchmarking (involving the arguments --target_taxID, --n_close_samples, --nruns_per_sample or --goldenSet_dir) may fail. You can download the reads on your own and provide them with --close_shortReads_table. This can be also due to network problems at this moment. \n---\n\n")

        # report the real cause, which the generic warning above does not show
        print("The underlying error was: %r" % (err,))
def test_bwa_mem_and_get_bam(r1, r2, ref_genome, replace=False):
    """Align the paired reads r1/r2 to ref_genome through fun.run_bwa_mem.

    The output folder is <dir of r1>/aligning_reads_against_<file name of
    ref_genome>; it is deleted first when `replace` is True. The alignment is
    requested with MarkDuplicates=True and 4 threads. Returns the path of the
    sorted bam.
    """

    # where the alignment goes
    reads_dir = fun.get_dir(r1)
    genome_name = fun.get_file(ref_genome)
    outdir = "{}/aligning_reads_against_{}".format(reads_dir, genome_name)

    # start from scratch if requested, then make sure the folder exists
    if replace is True:
        fun.delete_folder(outdir)
    fun.make_folder(outdir)

    # files produced by the alignment
    bamfile = "{}/aligned_reads.bam".format(outdir)
    sorted_bam = "{}.sorted".format(bamfile)
    index_bam = "{}.bai".format(sorted_bam)

    print("aligning reads")
    fun.run_bwa_mem(
        r1, r2, ref_genome, outdir, bamfile, sorted_bam, index_bam, "test_sample",
        threads=4, replace=False, MarkDuplicates=True)

    return sorted_bam
def test_read_simulation_and_get_reads(genome, window_l=2000, npairs=50000, read_length=150, median_insert_size=250, median_insert_size_sd=50, threads=4, replace=False):
    """Simulate paired-end reads for `genome` and return the two fastq paths.

    Everything is written under <genome>_simulating_reads. The genome is
    split into windows of `window_l` bp (bedtools makewindows), each window
    gets a random relative coverage between 0.5 and 2, and the read pairs are
    generated by fun.simulate_readPairs_per_window.

    Args:
        genome: path to the genome fasta.
        window_l: window length in bp.
        npairs: number of read pairs, passed to the simulator.
        read_length, median_insert_size, median_insert_size_sd: passed to
            the simulator.
        threads: number of threads, passed to the simulator.
        replace: if True, delete previous outputs before running.

    Returns:
        (reads1, reads2): paths of the expected gzipped fastq files.
    """

    # define the outdir
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    # only simulate if some of the expected files is missing/empty
    if any(fun.file_is_empty(f) for f in (reads1, reads2)):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" % (fun.bedtools, genome, window_l, windows_bed))

        # header=None: the bed file has no header line (header=-1 is rejected by current pandas)
        df_windows = fun.pd.read_csv(windows_bed, sep="\t", header=None, names=["chromosome", "start", "end"])

        # pick one distinct relative coverage per window. random.sample needs a
        # population at least as large as the sample, so the grid grows beyond
        # 10000 values when there are more than 10000 windows
        n_windows = len(df_windows)
        candidate_coverages = list(fun.np.linspace(0.5, 2, max(10000, n_windows)))
        df_windows["predicted_relative_coverage"] = fun.random.sample(candidate_coverages, n_windows)

        # simulate reads, honouring the requested number of threads
        fun.simulate_readPairs_per_window(
            df_windows, genome, npairs, outdir_reads, read_length,
            median_insert_size, median_insert_size_sd,
            replace=False, threads=threads)

    print("read simulation works well")

    return reads1, reads2
def test_smallVarCall_CNV_running(sorted_bam, outdir, ref_genome, gff, threads=4, mitochondrial_chromosome="mito_C_glabrata_CBS138", replace=False):
    """Run the small variant calling + CNV pipeline on a sorted bam.

    The bam should have some mutations. The pipeline script
    (varcall_cnv_pipeline) is launched as a shell command with ploidy 2, all
    callers and a minimum coverage of 5, writing into
    <outdir>/varcall_pooledSeq_<pooled_seq>. The run is skipped when the
    final annotation file already exists and `replace` is False.

    Args:
        sorted_bam: path to the sorted bam.
        outdir: output folder (deleted first if `replace` is True).
        ref_genome: path to the reference genome.
        gff: path to the annotation, passed with -gff.
        threads: number of threads for repeat finding and for the pipeline.
        mitochondrial_chromosome: passed with -mchr.
        replace: if True, remove previous results and rerun.
    """

    # if replace is True, remove the outdir
    if replace is True:
        fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats table (second element of the returned value),
    # using the requested number of threads
    repeats_table = fun.get_repeat_maskerDF(ref_genome, threads=threads, replace=False)[1]

    for pooled_seq in [False]:  # this may be also [False, True] to test pooled seq

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:

            print("running on pooled_seq=%s. This may take a bit because a lot of variants will be considered" % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (
                varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam,
                mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True:
                cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
def test_rearranging_genome_random(ref_genome, replace=False, threads=4, mitochondrial_chromosome="mito_C_glabrata_CBS138", nvars=5):
    """Simulate random variation on ref_genome and return the rearranged genome.

    Works in <ref_genome>.testing_rearranged_genome_generation and delegates
    to fun.rearrange_genomes_simulateSV, forwarding replace, nvars and
    mitochondrial_chromosome. `threads` is accepted but not used here.
    Returns the second element of what that function returns (the
    rearranged genome).
    """

    # working folder next to the reference genome
    workdir = "%s.testing_rearranged_genome_generation" % ref_genome
    fun.make_folder(workdir)

    # options forwarded to the simulation
    simulation_options = dict(
        replace=replace,
        nvars=nvars,
        mitochondrial_chromosome=mitochondrial_chromosome,
    )

    # the svtype-to-file mapping is not needed by this test
    _svtype_to_svfile, rearranged_genome = fun.rearrange_genomes_simulateSV(
        ref_genome, workdir, **simulation_options)

    print("The generation of a genome with randomly-inserted SVs works")

    return rearranged_genome
def test_processing_varcalling(smallVars_input_outdir, reference_genome, outdir, sorted_bam, replace=False, threads=4):
    """Check that the vcf processing step of varcall_cnv_pipeline works.

    `smallVars_input_outdir` is a folder where all the variant calling has
    already been performed. It is copied to <outdir>/smallVars_input_outdir
    and the pipeline is launched on the copy with --skip_cnv_analysis.
    `sorted_bam` is just a fake sorted bam, passed so that the pipeline
    does not have to be repeated.

    Args:
        smallVars_input_outdir: folder with the finished variant calling.
        reference_genome: path to the reference genome; the repeats table is
            expected at <reference_genome>.repeats.tab.
        outdir: folder under which the copy is made.
        sorted_bam: path passed with -sbam.
        replace: if True, redo the copy and rerun the pipeline.
        threads: passed with -thr.
    """

    import shutil  # local import: only needed to clear stale copies below

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)
    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir

    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # a leftover tmp dir (interrupted run) would make 'cp -r' copy INTO it
        shutil.rmtree(target_smallVars_input_outdir_tmp, ignore_errors=True)

        fun.run_cmd("cp -r %s %s " % (smallVars_input_outdir, target_smallVars_input_outdir_tmp))

        # with replace=True the target may already exist, and os.rename cannot
        # overwrite a non-empty directory; remove it only once the copy is done
        shutil.rmtree(target_smallVars_input_outdir, ignore_errors=True)

        os.rename(target_smallVars_input_outdir_tmp, target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (
            varcall_cnv_pipeline, reference_genome, target_smallVars_input_outdir,
            sorted_bam, reference_genome, threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
# bundled test inputs
test_ref_genome = "{}/reduced_genome.fasta".format(testing_inputs_dir)
test_mutated_genome = "{}/reduced_genome_mutated.fasta".format(testing_inputs_dir)
test_gff = "{}/reduced_annotation.gff".format(testing_inputs_dir)

# load the functions (test if you can import python packages)
import sv_functions as fun
print("loading python packages worked successfully")

# define the testing outputs dirs
testing_outputs_dir = "{}/testing_outputs".format(test_dir)
test_output_perSVade = "{}/perSVade_output".format(testing_outputs_dir)
outdir_small_variantCalling = "{}/smallVars_CNV_output".format(test_output_perSVade)

# create the outdirs (deleting them first is disabled)
#fun.delete_folder(testing_outputs_dir)
for _folder in (testing_outputs_dir, test_output_perSVade):
    fun.make_folder(_folder)

# working copies of the reference genome, the gff and the mutated genome
ref_genome = "{}/reduced_genome.fasta".format(testing_outputs_dir)
gff = "{}/reduced_annotations.gff".format(testing_outputs_dir)
mut_genome = "{}/mutated_genome.fasta".format(testing_outputs_dir)

# copy each input only if its working copy is missing/empty
for _origin, _working_copy in (
        (test_ref_genome, ref_genome),
        (test_gff, gff),
        (test_mutated_genome, mut_genome)):

    if fun.file_is_empty(_working_copy):
        fun.run_cmd("cp %s %s" % (_origin, _working_copy))

# define an example calbicans varCall_outout
# Root folder under which the perSVade repository and the testing folders are
# expected (hard-coded absolute path).
ParentDir = "/gpfs/projects/bsc40/mschikora"

# number of threads defined for this script
threads = 24

# define the dir where all perSVade code is, and put it first in the module
# search path so that sv_functions can be imported from it
perSVade_dir = "%s/scripts/perSVade/perSVade_repository/scripts"%ParentDir
sys.path.insert(0, perSVade_dir)

# import functions
import sv_functions as fun

# define paths
perSVade_py = "%s/perSVade.py"%perSVade_dir

# define dirs (the testing outdir is created right away)
outdir_testing = "%s/scripts/perSVade/perSVade_repository/testing/outdirs_testing_severalSpecies"%ParentDir; fun.make_folder(outdir_testing)
outdir_genomes_and_annotations = "%s/scripts/perSVade/perSVade_repository/testing/genomes_and_annotations"%ParentDir

################################

# NOTE(review): the triple-quoted note below is not closed within this chunk;
# it is left exactly as found.
""" This is how the genomes were obtained: C. glabrata: reference genome from CGD: the latest version by 12/03/2019, which is v_s02-m07-r35 C. albicans: ref genome CGD: http://www.candidagenome.org/download/sequence/C_albicans_SC5314/Assembly22/current/C_albicans_SC5314_version_A22-s07-m01-r110_chromosomes.fasta.gz gff from CGD: http://www.candidagenome.org/download/gff/C_albicans_SC5314/Assembly22/C_albicans_SC5314_version_A22-s07-m01-r110_features.gff
"--StopAfter_smallVarCallSimpleRunning", dest="StopAfter_smallVarCallSimpleRunning", action="store_true", default=False, help="Stop after obtaining the filtered vcf outputs of each program.") # get arguments opt = parser.parse_args() ###################################################### ###################################################### ###################################################### # debug commands if opt.replace is True: fun.delete_folder(opt.outdir) fun.make_folder(opt.outdir) if not opt.gff is None and fun.file_is_empty(opt.gff): raise ValueError("%s is not a valid gff" % opt.gff) # define the minimum AF ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1} if opt.minAF_smallVars == "infer": minAF_smallVars = ploidy_to_minAF[opt.ploidy] elif opt.minAF_smallVars <= 1 and opt.minAF_smallVars >= 0: minAF_smallVars = opt.minAF_smallVars else: raise ValueError("The value provided in --minAF_smallVars is incorrect") print("running small vars and CNV pipeline into %s" % opt.outdir) # check that the environment is correct