コード例 #1
0
def test_read_simulation_and_get_reads(genome,
                                       window_l=2000,
                                       npairs=50000,
                                       read_length=150,
                                       median_insert_size=250,
                                       median_insert_size_sd=50,
                                       threads=4,
                                       replace=False):
    """Simulate paired-end reads for a genome and return the read file paths.

    All outputs are written under "<genome>_simulating_reads". The simulation
    is only run when fun.file_is_empty reports that at least one of the two
    expected read files is empty.

    Args:
        genome: Path to the genome fasta file.
        window_l: Window size passed to "bedtools makewindows -w".
        npairs: Forwarded to fun.simulate_readPairs_per_window.
        read_length: Forwarded to fun.simulate_readPairs_per_window.
        median_insert_size: Forwarded to fun.simulate_readPairs_per_window.
        median_insert_size_sd: Forwarded to
            fun.simulate_readPairs_per_window.
        threads: Number of threads forwarded to the read simulation.
        replace: If True, the output folders are deleted before starting.

    Returns:
        A tuple (reads1, reads2) with the paths of the expected read files.
    """

    # define the outdir
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    if any(fun.file_is_empty(f) for f in (reads1, reads2)):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" %
                    (fun.bedtools, genome, window_l, windows_bed))

        # the bed file has no header row; header=None is the supported way
        # to say so (negative header values are rejected by current pandas)
        df_windows = fun.pd.read_csv(windows_bed,
                                     sep="\t",
                                     header=None,
                                     names=["chromosome", "start", "end"])

        # assign each window a distinct random relative coverage in [0.5, 2].
        # The pool is grown beyond 10000 values when there are more windows,
        # because sampling without replacement needs at least one value per
        # window.
        n_windows = len(df_windows)
        coverage_pool = list(
            fun.np.linspace(0.5, 2, max(10000, n_windows)))
        df_windows["predicted_relative_coverage"] = fun.random.sample(
            coverage_pool, n_windows)

        # simulate reads (replace=False because the folders were already
        # cleared above when replace is True)
        fun.simulate_readPairs_per_window(df_windows,
                                          genome,
                                          npairs,
                                          outdir_reads,
                                          read_length,
                                          median_insert_size,
                                          median_insert_size_sd,
                                          replace=False,
                                          threads=threads)

    print("read simulation works well")
    return reads1, reads2
コード例 #2
0
def test_conda_env_generation(outdir, replace=False):
    """Check that the conda environment EnvName can be exported and rebuilt.

    The environment is exported to a .yml file, re-created under the name
    "<EnvName>_test", activated to import sv_functions, and then removed.
    On success a marker file "<outdir>/correct_env.txt" is written, and the
    whole check is skipped on later calls unless replace is True or
    fun.file_is_empty reports the marker as empty.

    Args:
        outdir: Folder where the temporary .yml and the marker file go.
        replace: If True, run the check even if the marker file exists.
    """

    # define the file that indicates that the environment is correct
    correct_env_file = "%s/correct_env.txt" % outdir

    # define a test_env_name
    test_env_name = "%s_test" % EnvName

    if fun.file_is_empty(correct_env_file) or replace is True:

        # remove previous env. This is best-effort: a failure here is
        # reported as the env not existing. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        print("removing previous env")
        try:
            fun.run_cmd("conda remove -y -n %s --all" % test_env_name)
        except Exception:
            print("%s does not exist" % test_env_name)

        # export file
        print("creating %s yml" % EnvName)
        yml_file = "%s/%s.yml" % (outdir, test_env_name)
        fun.run_cmd(
            "conda env export --no-builds --from-history -n %s --file %s" %
            (EnvName, yml_file))

        # create environment
        print("re-generating as %s" % test_env_name)
        fun.run_cmd("conda env create --file %s --name %s" %
                    (yml_file, test_env_name))

        # test that the activation works
        print("activating %s" % test_env_name)
        cmd_activation = "source %s/etc/profile.d/conda.sh && conda activate %s && python -c 'import sys; sys.path.insert(0, \"%s\"); import sv_functions as fun'" % (
            AnacondaDir, test_env_name, fun.get_fullpath(scripts_dir))
        fun.run_cmd(cmd_activation)

        # remove file
        print("removing envs")
        fun.remove_file(yml_file)

        # remove env
        fun.run_cmd("conda remove -y -n %s --all" % test_env_name)

        # create file stating that the env is correct; the context manager
        # guarantees the handle is flushed and closed
        with open(correct_env_file, "w") as marker_handle:
            marker_handle.write("env is correct")

    print("%s can be correctly regenerated" % EnvName)
コード例 #3
0
def test_smallVarCall_CNV_running(
        sorted_bam,
        outdir,
        ref_genome,
        gff,
        threads=4,
        mitochondrial_chromosome="mito_C_glabrata_CBS138",
        replace=False):
    """Run the small-variant calling and CNV pipeline on a sorted bam.

    The command built around varcall_cnv_pipeline is run once per value of
    pooled_seq (currently only False), writing into
    "<outdir>/varcall_pooledSeq_<pooled_seq>". A run is skipped unless
    replace is True or fun.file_is_empty reports its final annotation file
    as empty.

    Args:
        sorted_bam: Path to the sorted bam (should contain some mutations).
        outdir: Output folder; deleted first when replace is True.
        ref_genome: Path to the reference genome.
        gff: Path to the gff passed to the pipeline through -gff.
        threads: Number of threads for the repeats step and the pipeline.
        mitochondrial_chromosome: Value passed to the pipeline through -mchr.
        replace: If True, remove outdir and re-run everything.
    """

    # if replace is True, remove the outdir
    if replace is True:
        fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats; element [1] of the result is what the pipeline
    # receives as --repeats_table. The caller's thread count is honoured.
    repeats_table = fun.get_repeat_maskerDF(ref_genome,
                                            threads=threads,
                                            replace=False)[1]

    # this may be also [False, True] to test pooled seq
    for pooled_seq in [False]:

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:
            print(
                "running on pooled_seq=%s. This may take a bit because a lot of variants will be considered"
                % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (
                varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam,
                mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True:
                cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
コード例 #4
0
def test_processing_varcalling(smallVars_input_outdir,
                               reference_genome,
                               outdir,
                               sorted_bam,
                               replace=False,
                               threads=4):
    """Check that the vcf processing step of varcall_cnv_pipeline works.

    A folder where variant calling was already performed is copied under
    "<outdir>/smallVars_input_outdir", and the pipeline is run on that copy
    with --skip_cnv_analysis.

    Args:
        smallVars_input_outdir: Folder with the finished variant calling.
        reference_genome: Path to the reference genome. The repeats table is
            expected at "<reference_genome>.repeats.tab".
        outdir: Folder under which the copy is placed.
        sorted_bam: A placeholder sorted bam, passed through -sbam so that
            the whole pipeline does not have to be repeated.
        replace: If True, redo the copy and re-run the pipeline.
        threads: Number of threads passed to the pipeline through -thr.
    """

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)

    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir
    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # a leftover tmp folder (e.g. from an interrupted run) would make
        # "cp -r" nest the copy inside it instead of creating it.
        # NOTE(review): assumes fun.delete_folder is a no-op for a missing
        # folder - confirm.
        fun.delete_folder(target_smallVars_input_outdir_tmp)

        fun.run_cmd(
            "cp -r %s %s " %
            (smallVars_input_outdir, target_smallVars_input_outdir_tmp))

        # os.rename cannot replace a non-empty directory, so an existing
        # target (replace=True case) is removed only once the copy is ready
        fun.delete_folder(target_smallVars_input_outdir)
        os.rename(target_smallVars_input_outdir_tmp,
                  target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (
            varcall_cnv_pipeline, reference_genome,
            target_smallVars_input_outdir, sorted_bam, reference_genome,
            threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
コード例 #5
0
import sv_functions as fun
print("loading python packages worked successfully")

# Output locations for the tests, all nested under <test_dir>/testing_outputs.
testing_outputs_dir = "%s/testing_outputs" % test_dir
test_output_perSVade = "%s/perSVade_output" % testing_outputs_dir
outdir_small_variantCalling = "%s/smallVars_CNV_output" % test_output_perSVade

# Create the output folders. The delete step is commented out, so outputs
# from previous runs are kept.
#fun.delete_folder(testing_outputs_dir)
fun.make_folder(testing_outputs_dir)
fun.make_folder(test_output_perSVade)

# Work on copies of the inputs placed in the outputs folder. Each copy is
# only made when fun.file_is_empty reports the destination as empty.

# reference genome
ref_genome = "%s/reduced_genome.fasta" % testing_outputs_dir
if fun.file_is_empty(ref_genome):
    fun.run_cmd("cp %s %s" % (test_ref_genome, ref_genome))

# gff annotations
gff = "%s/reduced_annotations.gff" % testing_outputs_dir
if fun.file_is_empty(gff):
    fun.run_cmd("cp %s %s" % (test_gff, gff))

# mutated genome
mut_genome = "%s/mutated_genome.fasta" % testing_outputs_dir
if fun.file_is_empty(mut_genome):
    fun.run_cmd("cp %s %s" % (test_mutated_genome, mut_genome))

# Example C. albicans variant calling output (read from the inputs folder).
Calbicans_varCall_outdir = (
    "%s/varcalling_output_Calbicans_SRR2088862" % testing_inputs_dir
)

# C. albicans genome: source path in the inputs folder and the path it will
# have in the outputs folder.
inputs_Calbicans_genome = "%s/Candida_albicans.fasta" % testing_inputs_dir
Calbicans_genome = "%s/Candida_albicans.fasta" % testing_outputs_dir
コード例 #6
0
    dest="StopAfter_smallVarCallSimpleRunning",
    action="store_true",
    default=False,
    help="Stop after obtaining the filtered vcf outputs of each program.")

# get arguments
opt = parser.parse_args()

######################################################
######################################################
######################################################

# debug commands
if opt.replace is True:
    fun.delete_folder(opt.outdir)
fun.make_folder(opt.outdir)
if opt.gff is not None and fun.file_is_empty(opt.gff):
    raise ValueError("%s is not a valid gff" % opt.gff)

# define the minimum AF: either inferred from the ploidy or a number in [0, 1]
ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1}
if opt.minAF_smallVars == "infer":
    minAF_smallVars = ploidy_to_minAF[opt.ploidy]
else:
    # The option may arrive as a string from the command line; comparing a
    # string with a number raises TypeError on Python 3, so convert first.
    try:
        minAF_smallVars = float(opt.minAF_smallVars)
    except (TypeError, ValueError):
        minAF_smallVars = None

    # reject non-numeric values and anything outside [0, 1] (NaN included)
    if minAF_smallVars is None or not (0 <= minAF_smallVars <= 1):
        raise ValueError(
            "The value provided in --minAF_smallVars is incorrect")

print("running small vars and CNV pipeline into %s" % opt.outdir)

# check that the environment is correct
fun.run_cmd(
コード例 #7
0
parser.add_argument(
    "--gDNA_code",
    dest="gDNA_code",
    required=False,
    default=1,
    type=int,
    help="The code of translation of ncbi of nuclear genes. Standard by default. C. albicans has 12",
)
opt = parser.parse_args()

# print the command line to run this


#############################################
################ RUNNING VEP ################
#############################################

# Paths of the cleaned gff, its compressed version and the tabix index.
gff_clean = "%s_clean.gff" % opt.gff
gff_clean_compressed = "%s_clean.gz" % opt.gff
gff_clean_compressed_tbi = "%s.tbi" % gff_clean_compressed

# Rebuild all three files whenever fun.file_is_empty reports the index as
# empty.
if fun.file_is_empty(gff_clean_compressed_tbi):

    # start from scratch: drop whatever is left of a previous attempt
    for _stale_path in (gff_clean, gff_clean_compressed,
                        gff_clean_compressed_tbi):
        fun.remove_file(_stale_path)

    print("compressing gff")

    # sort the gff, dropping lines that start with '#' and lines that
    # contain a tab-delimited 'chromosome' field
    fun.run_cmd(
        "%s sort -i %s | egrep -v '^#' | egrep -v $'\tchromosome\t' > %s"
        % (bedtools, opt.gff, gff_clean)
    )

    # compress the cleaned gff
    fun.run_cmd("%s -c %s > %s" % (bgzip, gff_clean, gff_clean_compressed))

    # index the compressed gff with tabix
    fun.run_cmd("%s %s" % (tabix, gff_clean_compressed))
コード例 #8
0
# Define the trimmed-read files expected for the requested data type:
# two files (one per mate) for paired Illumina data, one for nanopore.
if opt.type_data == "illumina_paired":

    final_trimmed_reads1 = "%s/%s_trimmed_reads_1.fastq.gz" % (opt.outdir,
                                                               opt.srr)
    final_trimmed_reads2 = "%s/%s_trimmed_reads_2.fastq.gz" % (opt.outdir,
                                                               opt.srr)

    final_files = [final_trimmed_reads1, final_trimmed_reads2]

elif opt.type_data == "nanopore":

    final_trimmed_reads = "%s/%s_trimmed_reads.fastq.gz" % (opt.outdir,
                                                            opt.srr)
    final_files = [final_trimmed_reads]

else:
    # Fail here with a clear message; otherwise final_files is never set in
    # this block and the code below would hit a NameError.
    raise ValueError(
        "type_data should be 'illumina_paired' or 'nanopore', not '%s'" %
        opt.type_data)

if any([fun.file_is_empty(f) for f in final_files]) or opt.replace is True:

    # make outdir
    fun.make_folder(opt.outdir)

    # run prefetch
    print("running prefetch")
    SRRfile = "%s/%s.srr" % (opt.outdir, opt.srr)
    SRRfile = fun.download_srr_with_prefetch(opt.srr,
                                             SRRfile,
                                             replace=opt.replace)

    # stop after prefetch
    if opt.stop_after_prefetch is True:
        print("Exiting after prefetch obtention")
        sys.exit(0)