Ejemplo n.º 1
0
def test_gridss_clove_pipeline(sorted_bam,
                               reference_genome,
                               outdir,
                               threads=4,
                               replace=False):
    """Check that the gridss + clove pipeline runs with the default filters.

    Creates `outdir` and calls fun.run_gridssClove_given_filters on
    `sorted_bam` and `reference_genome`, forwarding `threads` and `replace`
    (the latter both as `replace` and as `replace_FromGridssRun`).

    The two returned objects are unpacked but not inspected: the check
    passes as long as the call does not raise. Returns None.
    """

    fun.make_folder(outdir)

    # the median coverage is passed as -1 (noted as "to be recalculated")
    placeholder_median_coverage = -1

    # every keyword option of the run, gathered in one place
    pipeline_options = dict(
        replace=replace,
        threads=threads,
        gridss_blacklisted_regions="",
        gridss_VCFoutput="",
        gridss_maxcoverage=50000,
        median_insert_size=250,
        median_insert_size_sd=25,
        gridss_filters_dict=fun.default_filtersDict_gridss,
        run_in_parallel=True,
        max_rel_coverage_to_consider_del=0.2,
        min_rel_coverage_to_consider_dup=1.8,
        replace_FromGridssRun=replace,
    )

    # the outputs are only unpacked to confirm that two objects come back
    _sv_dict, _gridss_df = fun.run_gridssClove_given_filters(
        sorted_bam,
        reference_genome,
        outdir,
        placeholder_median_coverage,
        **pipeline_options)

    print("you could run the gridss + clove pipeline succesfully")
Ejemplo n.º 2
0
def test_SRAdb_query_downloading_and_readTrimming(outdir,
                                                  reference_genome,
                                                  target_taxID,
                                                  replace=False,
                                                  threads=4):
    """Check that querying SRA, downloading and trimming reads works.

    Runs fun.get_close_shortReads_table_close_to_taxID for `target_taxID`
    (ploidy 1, two close samples, one run per sample) with
    get_lowest_coverage_possible=True, then reads the resulting
    tab-separated table and verifies that it has exactly two rows and the
    columns short_reads1, short_reads2, runID and sampleID.

    This is a best-effort check: any failure (including the table not
    looking as expected) is reported as a warning on stdout, together with
    the underlying error, instead of being raised. Returns None.

    Example taxIDs:
    5476 is C. albicans
    1335626 is MERS
    """

    # make the outdir
    fun.make_folder(outdir)

    # set ploidy to 1
    ploidy = 1

    try:

        # run with 'get_lowest_coverage_possible=True', which will take the lowest coverage datasets
        close_shortReads_table = fun.get_close_shortReads_table_close_to_taxID(
            target_taxID,
            reference_genome,
            outdir,
            ploidy,
            n_close_samples=2,
            nruns_per_sample=1,
            replace=replace,
            threads=threads,
            min_fraction_reads_mapped=0.0,
            coverage_subset_reads=0.1,
            min_coverage=5,
            job_array_mode="local",
            StopAfter_sampleIndexingFromSRA=False,
            queue_jobs="debug",
            max_ncores_queue=768,
            time_read_obtention="02:00:00",
            StopAfterPrefecth_of_reads=False,
            get_lowest_coverage_possible=True)

        # check that the table has the expected columns and one row per sample
        df_close_shortReads_table = fun.pd.read_csv(close_shortReads_table,
                                                    sep="\t")

        expected_fields = {'short_reads2', 'short_reads1', 'runID', 'sampleID'}
        has_expected_fields = set(df_close_shortReads_table.keys()) == expected_fields

        if not has_expected_fields or len(df_close_shortReads_table) != 2:
            raise ValueError(
                "The close_shortReads_table %s was not created as expected" %
                close_shortReads_table)

        print(
            "The system to query the SRA database, dowload and trim reads works"
        )

    # Exception (not a bare except) so that Ctrl-C and SystemExit still stop the run
    except Exception as err:

        print(
            "\n\n---\nWARNING: The connection to SRA did not work. This means that the automated obtention of reads of close species for benchmarking (involving the arguments --target_taxID, --n_close_samples, --nruns_per_sample or --goldenSet_dir) may fail. You can download the reads on your own and provide them with --close_shortReads_table. This can be also due to network problems at this moment. \n---\n\n"
        )

        # report the real cause, which the generic warning above does not reveal
        print("The underlying error was %s: %s" % (type(err).__name__, err))
Ejemplo n.º 3
0
def test_bwa_mem_and_get_bam(r1, r2, ref_genome, replace=False):
    """Align the paired reads `r1`/`r2` to `ref_genome` and return the sorted bam path.

    The alignment is written under
    <dir of r1>/aligning_reads_against_<file name of ref_genome>, which is
    deleted first when `replace` is True. fun.run_bwa_mem is called with
    threads=4 and MarkDuplicates=True. The returned value is the path
    "<outdir>/aligned_reads.bam.sorted".
    """

    # the output folder sits next to the reads and is named after the reference
    reads_dir = fun.get_dir(r1)
    reference_name = fun.get_file(ref_genome)
    outdir = "%s/aligning_reads_against_%s" % (reads_dir, reference_name)

    # start from scratch if requested, then make sure the folder exists
    if replace is True:
        fun.delete_folder(outdir)
    fun.make_folder(outdir)

    # paths of the files handed to the aligner
    unsorted_bam = "%s/aligned_reads.bam" % outdir
    sorted_bam = "%s.sorted" % unsorted_bam
    sorted_bam_index = "%s.bai" % sorted_bam

    # positional arguments of the alignment, in the order run_bwa_mem is called with
    alignment_args = (r1, r2, ref_genome, outdir, unsorted_bam, sorted_bam,
                      sorted_bam_index, "test_sample")

    print("aligning reads")
    # replace=False here: the folder was already cleared above when replace is True
    fun.run_bwa_mem(*alignment_args,
                    threads=4,
                    replace=False,
                    MarkDuplicates=True)

    return sorted_bam
Ejemplo n.º 4
0
def test_read_simulation_and_get_reads(genome,
                                       window_l=2000,
                                       npairs=50000,
                                       read_length=150,
                                       median_insert_size=250,
                                       median_insert_size_sd=50,
                                       threads=4,
                                       replace=False):
    """Simulate paired-end reads for `genome` and return the two read files.

    Everything is written under "<genome>_simulating_reads" (reads under its
    "getting_reads" subfolder); both folders are deleted first when
    `replace` is True. If either expected read file is reported as empty by
    fun.file_is_empty, the genome is indexed, split into windows of
    `window_l` bp with bedtools makewindows, each window gets a random
    relative coverage drawn from 10000 evenly spaced values in [0.5, 2],
    and fun.simulate_readPairs_per_window is run with `npairs`,
    `read_length`, the insert size parameters and `threads`.

    Returns the tuple (reads1, reads2) with the paths
    "<outdir>/getting_reads/all_reads{1,2}.correct.fq.gz".
    """

    # define the outdir
    outdir = "%s_simulating_reads" % genome
    outdir_reads = "%s/getting_reads" % outdir

    # remove the outdirs if replace is True
    if replace is True:
        fun.delete_folder(outdir)
        fun.delete_folder(outdir_reads)

    # make folders
    fun.make_folder(outdir)
    fun.make_folder(outdir_reads)

    # define the expected reads
    reads1 = "%s/all_reads1.correct.fq.gz" % outdir_reads
    reads2 = "%s/all_reads2.correct.fq.gz" % outdir_reads

    if any(fun.file_is_empty(f) for f in (reads1, reads2)):

        # index the genome
        fun.run_cmd("%s faidx %s" % (fun.samtools, genome))

        # get the windows df
        windows_bed = "%s/windows_file.bed" % outdir
        fun.run_cmd("%s makewindows -g %s.fai -w %i > %s" %
                    (fun.bedtools, genome, window_l, windows_bed))

        # the bed file has no header row: header=None is the supported way to
        # say so (negative values such as -1 are rejected by current pandas)
        df_windows = fun.pd.read_csv(windows_bed,
                                     sep="\t",
                                     header=None,
                                     names=["chromosome", "start", "end"])

        # NOTE: sampling is without replacement from 10000 values, so this
        # raises ValueError when the genome yields more than 10000 windows
        df_windows["predicted_relative_coverage"] = fun.random.sample(
            list(fun.np.linspace(0.5, 2, 10000)), len(df_windows))

        # simulate reads (replace=False because the folders were already
        # cleared above when replace is True)
        fun.simulate_readPairs_per_window(df_windows,
                                          genome,
                                          npairs,
                                          outdir_reads,
                                          read_length,
                                          median_insert_size,
                                          median_insert_size_sd,
                                          replace=False,
                                          threads=threads)

    print("read simulation works well")
    return reads1, reads2
Ejemplo n.º 5
0
def test_smallVarCall_CNV_running(
        sorted_bam,
        outdir,
        ref_genome,
        gff,
        threads=4,
        mitochondrial_chromosome="mito_C_glabrata_CBS138",
        replace=False):
    """Run the small variant calling + CNV pipeline on `sorted_bam`.

    `outdir` is deleted first when `replace` is True. The repeats table is
    obtained with fun.get_repeat_maskerDF for `ref_genome`, and the
    `varcall_cnv_pipeline` script is run (ploidy 2, all callers, coverage 5)
    into "<outdir>/varcall_pooledSeq_<pooled_seq>" unless its final file
    "variant_annotation_ploidy2.tab" already exists and `replace` is False.
    Only pooled_seq=False is currently exercised.

    `threads` is used both for the repeats table and for the pipeline.
    Returns None; a failing command is expected to surface through
    fun.run_cmd (behavior not visible here).
    """

    # if replace is True, remove the outdir
    if replace is True: fun.delete_folder(outdir)

    # make the outdir
    fun.make_folder(outdir)

    # get the repeats (second element of the returned pair), honoring `threads`
    repeats_table = fun.get_repeat_maskerDF(ref_genome,
                                            threads=threads,
                                            replace=False)[1]

    for pooled_seq in [False
                       ]:  # this may be also [False, True] to test pooled seq

        outdir_varCall = "%s/varcall_pooledSeq_%s" % (outdir, str(pooled_seq))

        # define the final file
        final_file = "%s/variant_annotation_ploidy2.tab" % outdir_varCall

        if fun.file_is_empty(final_file) or replace is True:
            print(
                "running on pooled_seq=%s. This may take a bit because a lot of variants will be considered"
                % pooled_seq)

            # define the cmd
            cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr %s -mcode 3 -gcode 1 --repeats_table %s --remove_smallVarsCNV_nonEssentialFiles -gff %s -thr %i" % (
                varcall_cnv_pipeline, ref_genome, outdir_varCall, sorted_bam,
                mitochondrial_chromosome, repeats_table, gff, threads)

            # add pooled seq
            if pooled_seq is True: cmd += " --pooled_sequencing"

            fun.run_cmd(cmd)

    print("small variant calling and CNV of genes works")
Ejemplo n.º 6
0
def test_rearranging_genome_random(
        ref_genome,
        replace=False,
        threads=4,
        mitochondrial_chromosome="mito_C_glabrata_CBS138",
        nvars=5):
    """Simulate random structural variation on `ref_genome` and return the rearranged genome.

    Works in "<ref_genome>.testing_rearranged_genome_generation" and calls
    fun.rearrange_genomes_simulateSV with `replace`, `nvars` and
    `mitochondrial_chromosome`. The function returns the second element of
    the pair given back by that call (the rearranged genome); the first
    element is discarded.

    Note that `threads` is accepted but not used in this function.
    """

    # working folder, named after the reference genome
    outdir = "%s.testing_rearranged_genome_generation" % (ref_genome)
    fun.make_folder(outdir)

    simulation_outputs = fun.rearrange_genomes_simulateSV(
        ref_genome,
        outdir,
        replace=replace,
        nvars=nvars,
        mitochondrial_chromosome=mitochondrial_chromosome)

    # the call gives back a pair; only the rearranged genome is of interest
    _svtype_to_svfile, rearranged_genome = simulation_outputs

    print("The generation of a genome with randomly-inserted SVs works")

    return rearranged_genome
Ejemplo n.º 7
0
def test_processing_varcalling(smallVars_input_outdir,
                               reference_genome,
                               outdir,
                               sorted_bam,
                               replace=False,
                               threads=4):
    """Check that the vcf processing step of varcall_cnv_pipeline works.

    `smallVars_input_outdir` is a folder where the variant calling was
    already performed. It is copied to "<outdir>/smallVars_input_outdir"
    (when that folder is missing or `replace` is True) and the pipeline is
    run on the copy with --skip_cnv_analysis, unless its final file
    "variants_atLeast3PASS_ploidy2.vcf" already exists and `replace` is
    False. `sorted_bam` is just a fake sorted bam, passed so that the
    pipeline does not have to be repeated.

    The repeats table is expected at "<reference_genome>.repeats.tab".
    Returns None.
    """

    # get full paths
    outdir = fun.get_fullpath(outdir)
    smallVars_input_outdir = fun.get_fullpath(smallVars_input_outdir)
    reference_genome = fun.get_fullpath(reference_genome)

    # cp the files under outdir
    fun.make_folder(outdir)

    target_smallVars_input_outdir = "%s/smallVars_input_outdir" % outdir
    target_smallVars_input_outdir_tmp = "%s.tmp" % target_smallVars_input_outdir
    if not os.path.isdir(target_smallVars_input_outdir) or replace is True:

        # a tmp folder left by an interrupted run would make `cp -r` nest the
        # copy inside it instead of creating it, so drop it first
        if os.path.isdir(target_smallVars_input_outdir_tmp):
            fun.delete_folder(target_smallVars_input_outdir_tmp)

        fun.run_cmd(
            "cp -r %s %s " %
            (smallVars_input_outdir, target_smallVars_input_outdir_tmp))

        # os.rename cannot overwrite a non-empty directory, so when replacing,
        # remove the previous copy only once the new one is complete
        if os.path.isdir(target_smallVars_input_outdir):
            fun.delete_folder(target_smallVars_input_outdir)

        os.rename(target_smallVars_input_outdir_tmp,
                  target_smallVars_input_outdir)

    # final file
    final_file = "%s/variants_atLeast3PASS_ploidy2.vcf" % target_smallVars_input_outdir

    if fun.file_is_empty(final_file) or replace is True:

        cmd = "%s -r %s -o %s -p 2 -sbam %s -caller all -c 5 -mchr no_mitochondria -mcode 3 -gcode 1 --repeats_table %s.repeats.tab --remove_smallVarsCNV_nonEssentialFiles -thr %i --skip_cnv_analysis" % (
            varcall_cnv_pipeline, reference_genome,
            target_smallVars_input_outdir, sorted_bam, reference_genome,
            threads)

        fun.run_cmd(cmd)

    print("you can run successfully the variant processing")
Ejemplo n.º 8
0
# paths of the bundled test inputs (reference genome, mutated genome and annotation)
test_ref_genome = "%s/reduced_genome.fasta"%testing_inputs_dir
test_mutated_genome = "%s/reduced_genome_mutated.fasta"%testing_inputs_dir
test_gff = "%s/reduced_annotation.gff"%testing_inputs_dir

# load the functions (this also tests that the python packages can be imported)
import sv_functions as fun
print("loading python packages worked successfully")

# define the testing outputs dirs
testing_outputs_dir = "%s/testing_outputs"%test_dir
test_output_perSVade = "%s/perSVade_output"%testing_outputs_dir
outdir_small_variantCalling = "%s/smallVars_CNV_output"%test_output_perSVade

# create the output dirs (deleting the previous outputs is currently disabled)
#fun.delete_folder(testing_outputs_dir)
fun.make_folder(testing_outputs_dir)
fun.make_folder(test_output_perSVade)

# redefine the reference genome location: work on a copy under the outputs dir,
# made only when fun.file_is_empty reports that the copy is not there yet
ref_genome = "%s/reduced_genome.fasta"%testing_outputs_dir
if fun.file_is_empty(ref_genome): fun.run_cmd("cp %s %s"%(test_ref_genome, ref_genome))

# redefine the gff (note that the copy is named reduced_annotations.gff, with an 's')
gff = "%s/reduced_annotations.gff"%testing_outputs_dir
if fun.file_is_empty(gff): fun.run_cmd("cp %s %s"%(test_gff, gff))

# redefine the mutated genome location (the copy is renamed to mutated_genome.fasta)
mut_genome = "%s/mutated_genome.fasta"%testing_outputs_dir
if fun.file_is_empty(mut_genome): fun.run_cmd("cp %s %s"%(test_mutated_genome, mut_genome))

# define an example C. albicans varCall output
    ParentDir = "/gpfs/projects/bsc40/mschikora"
    threads = 24


# define the dir where all perSVade code is, and put it first on sys.path so
# that modules in it take priority in the import below
perSVade_dir = "%s/scripts/perSVade/perSVade_repository/scripts"%ParentDir
sys.path.insert(0, perSVade_dir)

# import functions
import sv_functions as fun

# define the path to the main perSVade script
perSVade_py = "%s/perSVade.py"%perSVade_dir

# define dirs: the testing outdir is created right away, while the genomes and
# annotations dir is only defined here
outdir_testing = "%s/scripts/perSVade/perSVade_repository/testing/outdirs_testing_severalSpecies"%ParentDir; fun.make_folder(outdir_testing)
outdir_genomes_and_annotations = "%s/scripts/perSVade/perSVade_repository/testing/genomes_and_annotations"%ParentDir

################################


"""
This is how the genomes were obtained:

C. glabrata: reference genome from CGD: the latest version by 12/03/2019, which is v_s02-m07-r35 

C. albicans: 

    ref genome CGD: http://www.candidagenome.org/download/sequence/C_albicans_SC5314/Assembly22/current/C_albicans_SC5314_version_A22-s07-m01-r110_chromosomes.fasta.gz

    gff from CGD: http://www.candidagenome.org/download/gff/C_albicans_SC5314/Assembly22/C_albicans_SC5314_version_A22-s07-m01-r110_features.gff
Ejemplo n.º 10
0
    "--StopAfter_smallVarCallSimpleRunning",
    dest="StopAfter_smallVarCallSimpleRunning",
    action="store_true",
    default=False,
    help="Stop after obtaining the filtered vcf outputs of each program.")

# get arguments
opt = parser.parse_args()

######################################################
######################################################
######################################################

# prepare the outdir (removing it first if --replace was given) and validate the gff
if opt.replace is True: fun.delete_folder(opt.outdir)
fun.make_folder(opt.outdir)
if opt.gff is not None and fun.file_is_empty(opt.gff):
    raise ValueError("%s is not a valid gff" % opt.gff)

# define the minimum AF: either inferred from the ploidy or a number in [0, 1]
ploidy_to_minAF = {1: 0.9, 2: 0.25, 3: 0.15, 4: 0.1}
if opt.minAF_smallVars == "infer":
    minAF_smallVars = ploidy_to_minAF[opt.ploidy]
else:
    # NOTE(review): the declaration of --minAF_smallVars is not visible here; if
    # it has no type=, argparse delivers a string, and comparing a string with a
    # number raises TypeError on Python 3. Coercing with float() handles both
    # strings and numbers.
    try:
        minAF_smallVars = float(opt.minAF_smallVars)
    except (TypeError, ValueError):
        raise ValueError("The value provided in --minAF_smallVars is incorrect")

    # only fractions are valid (this also rejects nan)
    if not 0 <= minAF_smallVars <= 1:
        raise ValueError("The value provided in --minAF_smallVars is incorrect")

print("running small vars and CNV pipeline into %s" % opt.outdir)

# check that the environment is correct