def test_read_simulation_and_get_reads(genome, window_l=2000, npairs=50000, read_length=150, median_insert_size=250, median_insert_size_sd=50, threads=4, replace=False):
    """Simulate read pairs for a genome and return the expected read files.

    Everything is written under ``<genome>_simulating_reads``. The simulation
    is skipped when both expected read files already exist and are non-empty.

    Args:
        genome: path of the genome; it is indexed with ``samtools faidx``.
        window_l: window size handed to ``bedtools makewindows -w``.
        npairs: forwarded to ``fun.simulate_readPairs_per_window``.
        read_length: forwarded to ``fun.simulate_readPairs_per_window``.
        median_insert_size: forwarded to ``fun.simulate_readPairs_per_window``.
        median_insert_size_sd: forwarded to ``fun.simulate_readPairs_per_window``.
        threads: forwarded to ``fun.simulate_readPairs_per_window``.
        replace: if True, the output folders are deleted before starting.

    Returns:
        A ``(reads1, reads2)`` tuple with the paths of the expected reads.
    """

    # define the outdirs
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    if any(fun.file_is_empty(f) for f in (reads1, reads2)):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" % (fun.bedtools, genome, window_l, windows_bed))

        # the bed file has no header row; header=None is the supported way to
        # say so (negative integers are rejected by current pandas)
        df_windows = fun.pd.read_csv(windows_bed, sep="\t", header=None, names=["chromosome", "start", "end"])

        # assign a random relative coverage (between 0.5 and 2) to each window.
        # random.sample() cannot draw more items than the population holds, so
        # the population grows with the number of windows when needed.
        n_coverage_values = max(10000, len(df_windows))
        df_windows["predicted_relative_coverage"] = fun.random.sample(list(fun.np.linspace(0.5, 2, n_coverage_values)), len(df_windows))

        # simulate reads. replace stays False here because the outdirs were
        # already wiped above when replace is True.
        fun.simulate_readPairs_per_window(df_windows, genome, npairs, outdir_reads, read_length, median_insert_size, median_insert_size_sd, replace=False, threads=threads)

    print("read simulation works well")

    return reads1, reads2
def test_conda_env_generation(outdir, replace=False):
    """Check that the conda environment ``EnvName`` can be re-created from an export.

    The environment is exported to a .yml file, re-created as
    ``<EnvName>_test``, activated to import ``sv_functions``, and then the
    test environment and the .yml file are removed. On success a marker file
    ``<outdir>/correct_env.txt`` is written so that later calls skip the check.

    Args:
        outdir: folder where the .yml file and the marker file are written.
        replace: if True, run the check even if the marker file exists.
    """

    # define the file that indicates that the environment is correct
    correct_env_file = "%s/correct_env.txt" % outdir

    # define a test_env_name
    test_env_name = "%s_test" % EnvName

    if fun.file_is_empty(correct_env_file) or replace is True:

        # remove previous env (best effort: it may simply not exist yet)
        print("removing previous env")
        try:
            fun.run_cmd("conda remove -y -n %s --all" % test_env_name)
        except Exception:
            # NOTE(review): assumes fun.run_cmd signals failure with a regular
            # exception; KeyboardInterrupt/SystemExit are no longer swallowed.
            print("%s does not exist" % test_env_name)

        # export file
        print("creating %s yml" % EnvName)
        yml_file = "%s/%s.yml" % (outdir, test_env_name)
        fun.run_cmd("conda env export --no-builds --from-history -n %s --file %s" % (EnvName, yml_file))

        # create environment
        print("re-generating as %s" % test_env_name)
        fun.run_cmd("conda env create --file %s --name %s" % (yml_file, test_env_name))

        # test that the activation works
        print("activating %s" % test_env_name)
        cmd_activation = "source %s/etc/profile.d/conda.sh && conda activate %s && python -c 'import sys; sys.path.insert(0, \"%s\"); import sv_functions as fun'" % (AnacondaDir, test_env_name, fun.get_fullpath(scripts_dir))
        fun.run_cmd(cmd_activation)

        # remove file
        print("removing envs")
        fun.remove_file(yml_file)

        # remove env
        fun.run_cmd("conda remove -y -n %s --all" % test_env_name)

        # create file stating that the env is correct; the context manager
        # guarantees the marker is flushed and closed
        with open(correct_env_file, "w") as marker_fd:
            marker_fd.write("env is correct")

    print("%s can be correctly regenerated" % EnvName)
def test_smallVarCall_CNV_running(sorted_bam, outdir, ref_genome, gff, threads=4, mitochondrial_chromosome="mito_C_glabrata_CBS138", replace=False):
    """Run the small variant calling and CNV pipeline on a sorted bam.

    The pipeline (``varcall_cnv_pipeline``) is only launched when
    ``<outdir>/varcall_pooledSeq_<pooled_seq>/variant_annotation_ploidy2.tab``
    is missing/empty or ``replace`` is True.

    Args:
        sorted_bam: sorted bam passed with ``-sbam`` (it should have some mutations).
        outdir: folder where the outputs are written.
        ref_genome: reference genome passed with ``-r``.
        gff: annotations file passed with ``-gff``.
        threads: threads used for the repeats table and passed with ``-thr``.
        mitochondrial_chromosome: value passed with ``-mchr``.
        replace: if True, ``outdir`` is deleted and everything is re-run.
    """

    # if replace is True, remove the outdir
    if replace is True:
        fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats, honouring the requested number of threads.
    # NOTE(review): replace stays False as in the original call, presumably to
    # reuse an existing repeats table - confirm.
    repeats_table = fun.get_repeat_maskerDF(ref_genome, threads=threads, replace=False)[1]

    for pooled_seq in [False]:  # this may be also [False, True] to test pooled seq

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:
            print("running on pooled_seq=%s. This may take a bit because a lot of variants will be considered" % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam, mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True:
                cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
def test_processing_varcalling(smallVars_input_outdir, reference_genome, outdir, sorted_bam, replace=False, threads=4):
    """Check that the processing of vcfs works in varcall_cnv_pipeline.

    The folder with finished variant calls is copied under ``outdir`` and the
    pipeline is run on the copy (with ``--skip_cnv_analysis``) when
    ``variants_atLeast3PASS_ploidy2.vcf`` is missing/empty or ``replace`` is True.

    Args:
        smallVars_input_outdir: folder where all the variant calling has
            already been performed.
        reference_genome: reference genome passed with ``-r``; the repeats
            table is expected at ``<reference_genome>.repeats.tab``.
        outdir: folder under which the copy is created.
        sorted_bam: a fake sorted bam, passed only so that the pipeline does
            not have to be repeated.
        replace: if True, the copy is refreshed and the pipeline is re-run.
        threads: value passed with ``-thr``.
    """

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)
    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir

    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # clear leftovers first: a stale .tmp folder would make 'cp -r' nest
        # the copy inside it, and os.rename() cannot overwrite an existing
        # non-empty target folder (the replace=True case)
        for stale_dir in (target_smallVars_input_outdir_tmp, target_smallVars_input_outdir):
            if os.path.isdir(stale_dir):
                fun.delete_folder(stale_dir)

        # copy into a tmp folder and rename, so that a partial copy is never
        # mistaken for a finished one
        fun.run_cmd("cp -r %s %s " % (smallVars_input_outdir, target_smallVars_input_outdir_tmp))
        os.rename(target_smallVars_input_outdir_tmp, target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (varcall_cnv_pipeline, reference_genome, target_smallVars_input_outdir, sorted_bam, reference_genome, threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
import sv_functions as fun

print("loading python packages worked successfully")

# layout of the testing outputs (each path is nested in the previous one)
testing_outputs_dir = "%s/testing_outputs" % test_dir
test_output_perSVade = "%s/perSVade_output" % testing_outputs_dir
outdir_small_variantCalling = "%s/smallVars_CNV_output" % test_output_perSVade

# create the output folders, parent first (the delete step is left disabled)
#fun.delete_folder(testing_outputs_dir)
for _folder in (testing_outputs_dir, test_output_perSVade):
    fun.make_folder(_folder)

# working copy of the reference genome
ref_genome = "%s/reduced_genome.fasta" % testing_outputs_dir
if fun.file_is_empty(ref_genome):
    _cp_cmd = "cp %s %s" % (test_ref_genome, ref_genome)
    fun.run_cmd(_cp_cmd)

# working copy of the gff
gff = "%s/reduced_annotations.gff" % testing_outputs_dir
if fun.file_is_empty(gff):
    _cp_cmd = "cp %s %s" % (test_gff, gff)
    fun.run_cmd(_cp_cmd)

# working copy of the mutated genome
mut_genome = "%s/mutated_genome.fasta" % testing_outputs_dir
if fun.file_is_empty(mut_genome):
    _cp_cmd = "cp %s %s" % (test_mutated_genome, mut_genome)
    fun.run_cmd(_cp_cmd)

# example C. albicans variant calling output (lives in the testing inputs)
Calbicans_varCall_outdir = "%s/varcalling_output_Calbicans_SRR2088862" % testing_inputs_dir

# C. albicans genome: location in the inputs and its destination in the outputs
inputs_Calbicans_genome = "%s/Candida_albicans.fasta" % testing_inputs_dir
Calbicans_genome = "%s/Candida_albicans.fasta" % testing_outputs_dir
dest="StopAfter_smallVarCallSimpleRunning", action="store_true", default=False, help="Stop after obtaining the filtered vcf outputs of each program.")

# get arguments
opt = parser.parse_args()

######################################################
######################################################
######################################################

# debug commands: start from a clean outdir when --replace is given
if opt.replace is True: fun.delete_folder(opt.outdir)
fun.make_folder(opt.outdir)

# a gff is optional, but if one is provided it must be a non-empty file
if not opt.gff is None and fun.file_is_empty(opt.gff): raise ValueError("%s is not a valid gff" % opt.gff)

# define the minimum AF. With "infer" it is looked up by ploidy, so a ploidy
# that is not a key of this dict (1-4) raises a KeyError.
ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1}
if opt.minAF_smallVars == "infer": minAF_smallVars = ploidy_to_minAF[opt.ploidy]
# NOTE(review): these comparisons need a numeric value; if the option is
# parsed as a string they raise TypeError in Python 3 - confirm the argparse type
elif opt.minAF_smallVars <= 1 and opt.minAF_smallVars >= 0: minAF_smallVars = opt.minAF_smallVars
else: raise ValueError("The value provided in --minAF_smallVars is incorrect")

print("running small vars and CNV pipeline into %s" % opt.outdir)

# check that the environment is correct
fun.run_cmd(
parser.add_argument(
    "--gDNA_code",
    dest="gDNA_code",
    required=False,
    default=1,
    type=int,
    help="The code of translation of ncbi of nuclear genes. Standard by default. C. albicans has 12"
)

opt = parser.parse_args()

# print the command line to run this

#############################################
################ RUNNING VEP ################
#############################################

# names of the cleaned gff, its compressed version and the tabix index
gff_clean = "%s_clean.gff"%opt.gff
gff_clean_compressed = "%s_clean.gz"%opt.gff
gff_clean_compressed_tbi = "%s.tbi"%gff_clean_compressed

# only (re)build the files when the index is missing or empty
if fun.file_is_empty(gff_clean_compressed_tbi):

    # start from scratch: drop whatever is left from a previous attempt
    for _stale_file in (gff_clean, gff_clean_compressed, gff_clean_compressed_tbi):
        fun.remove_file(_stale_file)

    print("compressing gff")

    # sort the gff, dropping comment lines and the 'chromosome' features
    _sort_cmd = "%s sort -i %s | egrep -v '^#' | egrep -v $'\tchromosome\t' > %s"%(bedtools, opt.gff, gff_clean)
    fun.run_cmd(_sort_cmd)

    # compress the cleaned gff
    _compress_cmd = "%s -c %s > %s"%(bgzip, gff_clean, gff_clean_compressed)
    fun.run_cmd(_compress_cmd)

    # index the compressed gff with tabix
    _index_cmd = "%s %s"%(tabix, gff_clean_compressed)
    fun.run_cmd(_index_cmd)
# define the expected final files, which depend on the type of data
if opt.type_data == "illumina_paired":

    final_trimmed_reads1 = "%s/%s_trimmed_reads_1.fastq.gz" % (opt.outdir, opt.srr)
    final_trimmed_reads2 = "%s/%s_trimmed_reads_2.fastq.gz" % (opt.outdir, opt.srr)
    final_files = [final_trimmed_reads1, final_trimmed_reads2]

elif opt.type_data == "nanopore":

    final_trimmed_reads = "%s/%s_trimmed_reads.fastq.gz" % (opt.outdir, opt.srr)
    final_files = [final_trimmed_reads]

else:
    # fail with a clear message instead of a NameError on final_files below
    raise ValueError("%s is not a valid type_data. It should be 'illumina_paired' or 'nanopore'" % opt.type_data)

# only run when some final file is missing/empty or --replace is given
if any(fun.file_is_empty(f) for f in final_files) or opt.replace is True:

    # make outdir
    fun.make_folder(opt.outdir)

    # run prefetch; the path returned by the download replaces the requested one
    print("running prefetch")
    SRRfile = "%s/%s.srr" % (opt.outdir, opt.srr)
    SRRfile = fun.download_srr_with_prefetch(opt.srr, SRRfile, replace=opt.replace)

    # stop after prefetch
    if opt.stop_after_prefetch is True:
        print("Exiting after prefetch obtention")
        sys.exit(0)