Exemple #1
0
def main():
    """Create the coverage fasta for the snakemake rule.

    If ``snakemake.params.coverage_fasta`` equals the string "None", the
    output file is simply touched. Otherwise the fasta is passed through
    ``fix_fasta.fix_fasta_lines`` with a line length of 80 and the result is
    handed to ``write_fasta``. Any exception prints the traceback and an error
    message to stderr, removes the first three snakemake outputs and exits
    with status 1.
    """
    mccutils.log("processing", "making coverage fasta")
    line_length = 80
    try:
        source_fasta = snakemake.params.coverage_fasta
        if source_fasta != "None":
            fixed_lines = fix_fasta.fix_fasta_lines(source_fasta, line_length)
            write_fasta(fixed_lines, snakemake.output.coverage_fasta)
        else:
            # nothing requested: leave an empty placeholder output
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...failed to create coverage fasta, check the formatting of :",
            snakemake.params.coverage_fasta,
            file=sys.stderr)
        for index in (0, 1, 2):
            mccutils.remove(snakemake.output[index])
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
Exemple #2
0
def main():
    """Turn PopoolationTE2 predictions into redundant/non-redundant BED files.

    Reads its inputs and parameters from the ``snakemake`` object. When no
    insertions are read, empty BED files are touched instead.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")
    te_predictions = snakemake.input.popoolationte2_out
    te_gff = snakemake.input.te_gff
    taxonomy = snakemake.input.taxonomy
    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    log = params.log

    ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
    insertions = read_insertions(
        te_predictions,
        ref_tes,
        chromosomes,
        sample_name,
        both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT,
        support_threshold=config.FREQUENCY_THRESHOLD)

    if len(insertions) == 0:
        # no predictions: still produce the (empty) expected output files
        for suffix in ("_popoolationte2_redundant.bed",
                       "_popoolationte2_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
Exemple #3
0
def run_retroseq(bam,
                 bed_locations,
                 ref_fasta,
                 script_dir,
                 sample_name,
                 out_dir,
                 params,
                 log=None):
    """Run ``retroseq.pl`` in ``-discover`` mode and then in ``-call`` mode.

    ``params`` must provide the keys "depth", "reads" and "q"; their values
    are stringified and passed to both invocations. The discovery output
    (``<out_dir>/<sample_name>.discovery``) is the input of the call step,
    which writes ``<out_dir>/<sample_name>.call``.
    """
    discovery_out = out_dir + "/" + sample_name + ".discovery"
    retroseq_script = script_dir + "/retroseq.pl"

    def threshold_args():
        # looked up for each invocation, exactly as the two commands need them
        return [
            "-depth", str(params["depth"]),
            "-reads", str(params['reads']),
            "-q", str(params['q']),
        ]

    discover_cmd = [
        "perl", retroseq_script, "-discover",
        "-bam", bam,
        "-refTEs", bed_locations,
        "-output", discovery_out,
    ] + threshold_args()
    mccutils.run_command(discover_cmd, log=log)

    call_out = out_dir + "/" + sample_name + ".call"
    call_cmd = [
        "perl", retroseq_script, "-call",
        "-bam", bam,
        "-input", discovery_out,
        "-filter", bed_locations,
        "-ref", ref_fasta,
        "-output", call_out,
        "-orientate", "yes",
    ] + threshold_args()
    mccutils.run_command(call_cmd, log=log)
Exemple #4
0
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    """Filter, rename and sort the raw BED into the non-redundant BED file.

    Each input line has its ";" replaced by tabs and is split on tabs. A line
    is kept when column 8 (index 7), read as an int, is greater than
    ``min_read_cutoff`` and column 1 is in ``chromosomes``. Kept records are
    sorted with ``bedtools sort`` and written under a track header to
    ``<out_dir>/<sample_name>_ngs_te_mapper_nonredundant.bed``; when nothing
    is kept that file is only touched. Temporary files are removed.
    """
    unsorted_bed = out_dir+"/unsorted.bed"
    kept = 0
    with open(unsorted_bed, "w") as unsorted_fh, open(bed, "r") as raw_fh:
        for index, raw_line in enumerate(raw_fh):
            fields = raw_line.replace(";", "\t").split("\t")
            if not (int(fields[7]) > min_read_cutoff and fields[0] in chromosomes):
                continue
            kept += 1
            # the name carries the 1-based line number of the raw record
            record = [
                fields[0],
                fields[1],
                fields[2],
                fields[5]+"_"+fields[8].replace("\n", "")+"_"+sample_name+"_ngs_te_mapper_sr_"+str(index+1),
                "0",
                fields[4],
            ]
            unsorted_fh.write("\t".join(record)+"\n")

    final_bed = out_dir+"/"+sample_name+"_ngs_te_mapper_nonredundant.bed"
    if kept < 1:
        mccutils.run_command(["touch", final_bed])
    else:
        sorted_bed = out_dir+"/sorted.bed"
        mccutils.run_command_stdout(["bedtools", "sort", "-i", unsorted_bed], sorted_bed, log=log)

        with open(final_bed, "w") as final_fh:
            final_fh.write('track name="'+sample_name+'_ngs_te_mapper" description="'+sample_name+'_ngs_te_mapper"\n')
            with open(sorted_bed, "r") as sorted_fh:
                final_fh.writelines(sorted_fh)
        mccutils.remove(sorted_bed)

    mccutils.remove(unsorted_bed)
def sort_bam(bam, sorted_bam, threads=1, log=None):
    """Sort ``bam`` with ``samtools sort`` into ``sorted_bam`` and return that path."""
    mccutils.log("popoolationte2", "sorting BAM", log=log)
    sort_cmd = ["samtools", "sort"]
    sort_cmd += ["-@", str(threads)]
    sort_cmd += [bam, "-o", sorted_bam]
    mccutils.run_command(sort_cmd, log=log)
    return sorted_bam
def main():
    """Convert RetroSeq calls into BED and VCF outputs.

    Inputs and parameters come from the ``snakemake`` object. With at least
    one insertion the redundant BED, non-redundant BED and VCF are written
    through the ``output`` module; otherwise the two BED files are touched.
    """
    mccutils.log("retroseq", "processing RetroSeq results")
    retroseq_out = snakemake.input.retroseq_out
    reference_fasta = snakemake.input.reference_fasta

    params = snakemake.params
    out_dir = params.out_dir
    ref_name = params.ref_name
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        retroseq_out,
        sample_name,
        chromosomes,
        support_threshold=config.READ_SUPPORT_THRESHOLD,
        breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD)

    if len(insertions) == 0:
        for suffix in ("_retroseq_redundant.bed", "_retroseq_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="retroseq")
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="retroseq")
        output.write_vcf(
            insertions, reference_fasta, sample_name, "retroseq", out_dir)
    mccutils.log("retroseq", "RetroSeq post processing complete")
Exemple #7
0
def main():
    """Produce the reference fasta and the (optionally augmented) reference.

    The reference is run through ``fix_fasta_lines`` and
    ``mccutils.replace_special_chars_fasta`` using temporary files under
    ``<mcc_out>/tmp``. When ``snakemake.params.augment`` is not the string
    "None", the augment fasta gets the same treatment and is combined with
    the reference by ``augment_reference``. Both results are copied to the
    snakemake outputs.
    """
    reference = snakemake.input.ref
    params = snakemake.params
    augment = params.augment
    mcc_out = params.mcc_out
    run_id = params.run_id
    log = params.log
    out_ref = snakemake.output.ref
    out_aug_ref = snakemake.output.aug_ref

    tmp_dir = mcc_out+"/tmp"
    if not os.path.exists(tmp_dir):
        mccutils.mkdir(tmp_dir)

    mccutils.log("processing","making reference fasta")

    # every intermediate file shares this prefix and differs by a numeric suffix
    tmp_prefix = mcc_out+"/tmp/"+str(run_id)+"reference.tmp"
    reference = fix_fasta_lines(reference, tmp_prefix)
    reference = mccutils.replace_special_chars_fasta(reference, tmp_prefix+"1")

    if augment == "None":
        augmented_reference = reference
    else:
        augment = fix_fasta_lines(augment, tmp_prefix+"2")
        augment = mccutils.replace_special_chars_fasta(augment, tmp_prefix+"3")
        augmented_reference = augment_reference(reference, augment, tmp_prefix+"4")

    for source, destination in ((reference, out_ref),
                                (augmented_reference, out_aug_ref)):
        mccutils.run_command(["cp", source, destination])

    mccutils.log("processing","reference fasta created")
Exemple #8
0
def main():
    """Post-process RelocaTE results into redundant/non-redundant BED files.

    Insertions are read from the RelocaTE GFF with the thresholds defined in
    ``config``, passed through ``set_ref_orientations`` with the TE GFF, and
    written as BED files; when none remain the BED files are only touched.
    """
    relocate_gff = snakemake.input.relocate_gff
    te_gff = snakemake.input.te_gff

    params = snakemake.params
    out_dir = params.out_dir
    log = params.log
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate", "processing RelocaTE results")

    thresholds = {
        "ref_l_threshold": config.REF_LEFT_THRESHOLD,
        "ref_r_threshold": config.REF_RIGHT_THRESHOLD,
        "nonref_l_threshold": config.NONREF_LEFT_THRESHOLD,
        "nonref_r_threshold": config.NONREF_RIGHT_THRESHOLD,
    }
    insertions = get_insertions(relocate_gff, sample_name, chromosomes,
                                **thresholds)
    insertions = set_ref_orientations(insertions, te_gff)

    if len(insertions) == 0:
        for suffix in ("_relocate_redundant.bed", "_relocate_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("relocate", "RelocaTE postprocessing complete")
Exemple #9
0
def run_retroseq(bam, bed_locations, ref_fasta, script_dir, sample_name, out_dir, params, log=None):
    """Run ``retroseq.pl`` in ``-discover`` mode and then in ``-call`` mode.

    Every key of ``params`` is appended to both commands as an option,
    followed by its stringified value.
    """
    def append_params(command):
        # option name first, then its value as a string
        for option in params.keys():
            command.extend([option, str(params[option])])
        return command

    discovery_out = out_dir+"/"+sample_name+".discovery"
    discover_cmd = append_params([
        "perl", script_dir+"/retroseq.pl",
        "-discover",
        "-bam", bam,
        "-refTEs", bed_locations,
        "-output", discovery_out,
    ])
    mccutils.run_command(discover_cmd, log=log)

    call_out = out_dir+"/"+sample_name+".call"
    call_cmd = append_params([
        "perl", script_dir+"/retroseq.pl",
        "-call",
        "-bam", bam,
        "-input", discovery_out,
        "-filter", bed_locations,
        "-ref", ref_fasta,
        "-output", call_out,
        "-orientate", "yes",
    ])
    mccutils.run_command(call_cmd, log=log)
Exemple #10
0
def map_reads(fq1,
              fq2,
              ref_name,
              median_insert_size,
              out,
              threads=1,
              paired=True,
              log=None):
    """Map reads with ``tepid-map`` (paired) or ``tepid-map-se`` (single end).

    Note that the working directory of the process is changed to ``out``
    before the mapper is started.

    Args:
        fq1: first (or only) fastq file.
        fq2: second fastq file; only used when ``paired`` is true.
        ref_name: reference name, used for the index prefixes under ``out``
            and for the names of the produced BAM files.
        median_insert_size: value passed to ``-s`` in paired mode; it is
            converted with ``str`` so numeric values are accepted.
        out: output directory.
        threads: value passed to ``-p``.
        paired: selects paired-end or single-end mapping.
        log: forwarded to ``mccutils.run_command``.

    Returns:
        Tuple ``(bam, split_bam)`` with the paths ``<out>/<ref_name>.bam`` and
        ``<out>/<ref_name>.split.bam``, both passed to
        ``mccutils.check_file_exists`` first.
    """
    os.chdir(out)
    index_prefix = out + "/" + ref_name
    if paired:
        command = [
            "tepid-map", "-x", index_prefix, "-y",
            index_prefix + ".X15_01_65525S", "-p",
            # every argv element must be a string, like ``threads`` below
            str(threads), "-s", str(median_insert_size), "-n", ref_name,
            "-1", fq1, "-2", fq2
        ]
    else:
        command = [
            "tepid-map-se", "-x", index_prefix, "-y",
            index_prefix + ".X15_01_65525S", "-p",
            str(threads), "-n", ref_name, "-q", fq1
        ]

    mccutils.run_command(command, log=log)

    bam = index_prefix + ".bam"
    split_bam = index_prefix + ".split.bam"
    mccutils.check_file_exists(bam)
    mccutils.check_file_exists(split_bam)

    return bam, split_bam
Exemple #11
0
def run_repeatmasker(reference, ref_name, te_seqs, threads, log, outfile,
                     outdir):
    """Run RepeatMasker on ``reference`` and move its ``.out`` file to ``outfile``.

    RepeatMasker runs from a freshly recreated ``<outdir>/tmp/repeatmasker``
    directory; afterwards the working directory is changed to ``outdir``. The
    file in that directory whose name ends with "fasta.out" is moved to
    ``outfile``; if no such file exists the process exits with a message.
    """
    work_dir = outdir + "/tmp/repeatmasker"
    mccutils.remove(work_dir)
    mccutils.mkdir(work_dir)
    os.chdir(work_dir)

    repeatmasker_cmd = ["RepeatMasker", "-pa", str(threads)]
    repeatmasker_cmd += ["-lib", te_seqs, "-dir", work_dir]
    repeatmasker_cmd += ["-s", "-nolow", "-no_is", reference]
    mccutils.run_command(repeatmasker_cmd, log=log)

    os.chdir(outdir)

    rm_out = ""
    for filename in os.listdir(work_dir):
        # when several files match, the last one listed wins
        if filename.endswith("fasta.out"):
            rm_out = work_dir + "/" + filename

    if not rm_out:
        sys.exit("can't find Repeatmasker output in:" + work_dir + "\n")

    mccutils.run_command(["mv", rm_out, outfile])
def main():
    """Post-process PopoolationTE output into redundant/non-redundant BED files.

    Inputs and parameters come from the ``snakemake`` object; filtering
    thresholds come from ``config``. When no insertions are read, the two BED
    files are only touched.
    """
    mccutils.log("popoolationte", "processing PopoolationTE results")
    popoolationte_out = snakemake.input.popoolationte_out

    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name
    log = params.log
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        popoolationte_out,
        sample_name,
        chromosomes,
        require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT,
        percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD)

    if len(insertions) == 0:
        for suffix in ("_popoolationte_redundant.bed",
                       "_popoolationte_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = mccutils.make_redundant_bed(
            insertions, sample_name, out_dir, method="popoolationte")
        mccutils.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="popoolationte")
    mccutils.log("popoolationte", "PopoolationTE postprocessing complete")
Exemple #13
0
def main():
    """Post-process TE-Locate output into redundant/non-redundant BED files.

    Insertions are read with the read-pair threshold from ``config`` and then
    passed through ``filter_by_reference`` with the TE GFF. When none remain,
    the two BED files are only touched.
    """
    mccutils.log("te-locate", "processing TE-Locate results")
    telocate_raw = snakemake.input.telocate_raw
    te_gff = snakemake.input.te_gff

    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = filter_by_reference(
        read_insertions(telocate_raw,
                        sample_name,
                        chromosomes,
                        rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD),
        te_gff)

    if len(insertions) == 0:
        for suffix in ("_telocate_redundant.bed", "_telocate_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)
    mccutils.log("te-locate", "TE-Locate post processing complete")
Exemple #14
0
def main():
    """Download, unpack, patch and install TEFLoN under ``<install>/tools/teflon``.

    The zip is fetched with ``mccutils.download`` (up to 3 attempts, checked
    against ``snakemake.params.md5``); on failure the process exits with
    status 1. The unpacked directory is moved into the tools directory, its
    contents are moved into a fresh ``teflon`` directory, two patches are
    applied, and the md5 is written to ``teflon/version.log``.
    """
    archive_dir = "TEFLoN-3e2d67886b70644fd1f7d79263b3c8dbed639e46"
    install_root = snakemake.config['paths']['install']
    install_path = install_root + "/tools/"
    teflon_dir = install_path + "teflon"

    def run(command):
        mccutils.run_command(command, log=snakemake.params.log)

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(snakemake.params.url,
                                         snakemake.params.zipfile,
                                         md5=snakemake.params.md5,
                                         max_attempts=3)

    if not download_success:
        print("teflon download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # unpack, clearing any directory left over from an earlier attempt
    mccutils.remove(install_root + archive_dir)
    run(["unzip", snakemake.params.zipfile])

    mccutils.remove(install_path + archive_dir)
    run(["mv", install_root + archive_dir, install_path])

    mccutils.remove(teflon_dir)
    mccutils.mkdir(teflon_dir)

    for entry in os.listdir(install_path + archive_dir):
        run(["mv", install_path + archive_dir + "/" + entry, teflon_dir])

    run([
        "patch", "-i", snakemake.params.pseudo2refConvert_patch,
        install_path + "teflon/teflon_scripts/pseudo2refConvert.py"
    ])
    run([
        "patch", "-i", snakemake.params.teflon_patch,
        install_path + "teflon/teflon.v0.4.py"
    ])

    mccutils.remove(install_path + archive_dir)
    mccutils.remove(snakemake.params.zipfile)

    # write version to file
    version_log = snakemake.config['paths']['install'] + "/tools/teflon/version.log"
    with open(version_log, "w") as version:
        version.write(snakemake.params.md5)
Exemple #15
0
def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, flags=None):
    """Run ``trim_galore`` and return the path(s) of the trimmed fastq file(s).

    Args:
        fq1: first (or only) fastq file.
        run_id: not used by this function.
        log: forwarded to ``mccutils.run_command``.
        out: base output directory; results go to ``<out>/results/trimgalore``.
        fq2: optional second fastq file; selects paired-end mode when given.
        cores: value passed to ``-j``.
        flags: extra command-line flags placed right after ``trim_galore``.
            Defaults to no extra flags.

    Returns:
        In single-end mode, the path of the file whose name contains
        "_trimmed.fq". In paired-end mode, a tuple with the paths of the files
        whose names contain "_val_1.fq" and "_val_2.fq". Each path is passed
        to ``mccutils.check_file_exists`` before being returned; a path is the
        empty string when no matching file was found.
    """
    # None instead of a shared mutable default list
    extra_flags = [] if flags is None else list(flags)
    results_dir = out+"/results/trimgalore"

    mccutils.mkdir(out+"/results/")
    command = ['trim_galore'] + extra_flags + ["-j", str(cores), "-o", results_dir]
    if fq2 is None:
        command.append(fq1)
    else:
        command += [fq1, fq2]

    mccutils.run_command(command, log=log)

    if fq2 is None:
        outfq = ""
        for f in os.listdir(results_dir):
            if "_trimmed.fq" in f:
                outfq = results_dir+"/"+f

        mccutils.check_file_exists(outfq)
        return outfq

    outfq1 = ""
    outfq2 = ""
    for f in os.listdir(results_dir):
        if "_val_1.fq" in f:
            outfq1 = results_dir+"/"+f
        elif "_val_2.fq" in f:
            outfq2 = results_dir+"/"+f

    mccutils.check_file_exists(outfq1)
    mccutils.check_file_exists(outfq2)
    return outfq1, outfq2
def combine_alignments(sam1, sam2, fq1, fq2, script_path, out, log=None):
    """Run ``samro.pl`` on two SAM files and return the combined SAM path.

    ``script_path`` and ``out`` are concatenated directly with the file
    names, so both are expected to end with a path separator.
    """
    out_sam = out + "combined.sam"
    samro_cmd = ["perl", script_path + "samro.pl"]
    samro_cmd += ["--sam1", sam1, "--sam2", sam2]
    samro_cmd += ["--fq1", fq1, "--fq2", fq2]
    samro_cmd += ["--output", out_sam]
    mccutils.run_command(samro_cmd, log=log)
    return out_sam
Exemple #17
0
def main():
    """Run ``faToTwoBit`` on the first snakemake input, writing the first output."""
    log = snakemake.params.log
    mccutils.log("processing",
                 "creating 2bit file from reference genome fasta",
                 log=log)
    mccutils.run_command(
        ["faToTwoBit", snakemake.input[0], snakemake.output[0]], log=log)
    mccutils.log("processing", "reference 2bit file created")
def main():
    """Prepare the BAM file that PopoolationTE2 consumes.

    The output directory is emptied, the reference is indexed, both fastq
    files are formatted and mapped separately, and the two SAM files are
    merged into ``snakemake.output.bam``. "COMPLETED" is written to the status
    log on success. On any exception the traceback is printed to stderr and
    appended to the log, "FAILED" is written to the status log, and the output
    BAM is touched.
    """
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    params = snakemake.params
    jar = params.jar
    log = params.log
    out_dir = params.out_dir
    threads = snakemake.threads
    status_log = params.status_log

    try:
        # ensures intermediate files from previous runs are removed
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        tmp_dir = out_dir + "/tmp"
        mccutils.mkdir(tmp_dir)
        index_fasta(ref_fasta, log=log)
        fq1 = format_fastq(fq1, out_dir + "/reads_1.fastq", log=log)
        fq2 = format_fastq(fq2, out_dir + "/reads_2.fastq", log=log)
        sam1 = map_reads(
            ref_fasta, fq1, out_dir + "/mapped_1.sam", threads=threads, log=log)
        sam2 = map_reads(
            ref_fasta, fq2, out_dir + "/mapped_2.sam", threads=threads, log=log)
        sam_to_bam(jar, fq1, fq2, sam1, sam2, snakemake.output.bam, out_dir,
                   threads=threads, log=log)
        mccutils.remove(tmp_dir)

        mccutils.check_file_exists(snakemake.output.bam)
        with open(status_log, "w") as status_fh:
            status_fh.write("COMPLETED\n")

        mccutils.log("popoolationte2", "PopoolationTE2 preprocessing complete")

    except Exception:
        failure = traceback.format_exc()
        print(failure, file=sys.stderr)
        with open(log, "a") as log_fh:
            print(failure, file=log_fh)
        mccutils.log("popoolationte2", "popoolationte2 preprocessing failed")
        with open(status_log, "w") as status_fh:
            status_fh.write("FAILED\n")

        # leave an output file in place even though the run failed
        mccutils.run_command(["touch", snakemake.output.bam])
def make_local_css_js_copies(css_dir, js_dir, out_dir):
    """Create ``html``, ``css`` and ``js`` under ``out_dir`` and copy the assets.

    Every entry of ``css_dir`` is copied into ``<out_dir>/css/`` and every
    entry of ``js_dir`` into ``<out_dir>/js/``.
    """
    mccutils.mkdir(out_dir + "/html/")
    for source_dir, subdir in ((css_dir, "/css/"), (js_dir, "/js/")):
        target_dir = out_dir + subdir
        mccutils.mkdir(target_dir)
        for asset in os.listdir(source_dir):
            mccutils.run_command(["cp", source_dir + "/" + asset, target_dir])
Exemple #20
0
def sam_to_bam(jar, fq1, fq2, sam1, sam2, bam, out_dir, threads=1, log=None):
    """Run the ``se2pe`` command of ``jar`` on two SAM files and return ``bam``.

    Java's temporary directory is pointed at ``<out_dir>/tmp``. ``threads``
    is accepted but not used by this function.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)
    se2pe_cmd = ["java", "-Djava.io.tmpdir=" + out_dir + "/tmp", "-jar", jar, "se2pe"]
    se2pe_cmd += ["--fastq1", fq1, "--fastq2", fq2]
    se2pe_cmd += ["--bam1", sam1, "--bam2", sam2]
    se2pe_cmd += ["--sort", "--output", bam]
    mccutils.run_command(se2pe_cmd, log=log)
    return bam
Exemple #21
0
def main():
    """Post-process RelocaTE2 results into redundant/non-redundant BED files.

    Reference and non-reference insertions are read from their GFF files with
    separate thresholds from ``config``; reference insertions are passed
    through ``fix_ref_te_names`` with the RepeatMasker output. The combined
    list is written as BED files, or the BED files are only touched when the
    list is empty.
    """
    nonref_gff = snakemake.input.nonref_gff
    ref_gff = snakemake.input.ref_gff
    rm_out = snakemake.input.rm_out

    params = snakemake.params
    log = params.log
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate2", "processing RelocaTE2 results")

    ref_thresholds = {
        "l_support_threshold": config.REF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.REF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.REF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.REF_RIGHT_JUNCTION_THRESHOLD,
    }
    ref_insertions = get_insertions(ref_gff, sample_name, chromosomes,
                                    insert_type="ref", **ref_thresholds)

    nonref_thresholds = {
        "l_support_threshold": config.NONREF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.NONREF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.NONREF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.NONREF_RIGHT_JUNCTION_THRESHOLD,
    }
    nonref_insertions = get_insertions(nonref_gff, sample_name, chromosomes,
                                       insert_type="nonref",
                                       **nonref_thresholds)

    ref_insertions = fix_ref_te_names(ref_insertions, rm_out, sample_name)

    all_insertions = ref_insertions + nonref_insertions

    if len(all_insertions) == 0:
        for suffix in ("_relocate2_redundant.bed",
                       "_relocate2_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        all_insertions = mccutils.make_redundant_bed(
            all_insertions, sample_name, out_dir, method="relocate2")
        mccutils.make_nonredundant_bed(
            all_insertions, sample_name, out_dir, method="relocate2")

    mccutils.log("relocate2", "RelocaTE2 postprocessing complete")
Exemple #22
0
def main():
    """Run the PoPoolationTE workflow for one sample.

    The input paths are appended to the log file, the read length and median
    insert size are determined, the TE GFF is converted into a known-insert
    file, and ``run_popoolationte`` is called with thresholds from ``config``.
    Afterwards the first snakemake output is touched and the SAM and both
    fastq inputs are removed.
    """
    mccutils.log("popoolationte", "running PopoolationTE")
    inputs = snakemake.input
    ref_fasta = inputs.ref_fasta
    taxonomy = inputs.taxonomy
    te_gff = inputs.te_gff
    fq1 = inputs.fq1
    fq2 = inputs.fq2
    sam = inputs.sam
    log = snakemake.params.log
    with open(log, "a") as log_fh:
        for label, path in (("reference fasta: ", ref_fasta),
                            ("Taxonomy TSV: ", taxonomy),
                            ("TE GFF: ", te_gff),
                            ("fastq1: ", fq1),
                            ("fastq2: ", fq2),
                            ("SAM: ", sam)):
            log_fh.write(label + path + "\n")

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    script_dir = snakemake.params.script_dir

    mccutils.log("popoolationte", "getting read length")
    read_length = get_read_length(fq1, fq2)
    mccutils.log("popoolationte", "calculating median insert size")
    median_insert_size = get_median_insert_size(sam)
    # maximum distance: three median insert sizes plus one read length
    max_dist = int(median_insert_size * 3) + read_length
    mccutils.log("popoolationte",
                 "converting TE gff to PoPoolationTE known TE file")
    known_inserts = make_known_insert_file(te_gff, out_dir)
    mccutils.log("popoolationte", "running the PoPoolationTE workflow scripts")
    run_popoolationte(
        sam,
        ref_fasta,
        taxonomy,
        read_length,
        median_insert_size,
        max_dist,
        known_inserts,
        script_dir,
        out_dir,
        log=log,
        identify_min_count=config.IDENTIFY_TE_INSERTSITES["min-count"],
        identify_min_qual=config.IDENTIFY_TE_INSERTSITES["min-map-qual"],
        crosslink_site_shift=config.CROSSLINK_TE_SITES['single-site-shift'],
        update_te_inserts_site_shift=(
            config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift']),
        estimate_polymorphism_min_qual=(
            config.ESTIMATE_POLYMORPHISM['min-map-qual']),
        filter_min_count=config.FILTER['min-count'])

    mccutils.run_command(["touch", snakemake.output[0]])

    for consumed in (sam, fq1, fq2):
        mccutils.remove(consumed)
def repeat_mask(reference, te_fasta, chromosomes, procs, run_id, log, out):
    """Run RepeatMasker on ``reference`` and return a reformatted TE GFF path.

    RepeatMasker runs in ``<out>/tmp/repeatmasker_<run_id>`` with ``-gff``.
    Lines of its GFF without a "#" whose first column is in ``chromosomes``
    are rewritten so that the feature type and the attributes
    (``ID``/``Name``/``Alias``) carry the TE name, then written to
    ``<out>/tmp/<run_id>tmpreferenceTEs.gff``. On any exception the traceback
    and an error message go to stderr and the process exits with status 1.
    """
    try:
        work_dir = out + "/tmp/repeatmasker_" + run_id
        mccutils.mkdir(work_dir)
        os.chdir(work_dir)
        repeatmasker_cmd = ["RepeatMasker", "-pa", str(procs)]
        repeatmasker_cmd += ["-lib", te_fasta, "-dir", work_dir]
        repeatmasker_cmd += ["-s", "-gff", "-nolow", "-no_is", reference]
        mccutils.run_command(repeatmasker_cmd, log=log)
        os.chdir(out)

        # RepeatMasker appears to override the custom database names during the ProcessRepeats
        # this step changes them back, more rules may be needed for other reference genomes
        ref_name = os.path.basename(reference)
        repeatmasker_gff = work_dir + "/" + ref_name + ".out.gff"
        formatted_ref_tes = out + "/tmp/" + run_id + "tmpreferenceTEs.gff"
        with open(repeatmasker_gff, "r") as rm_fh, \
                open(formatted_ref_tes, "w") as formatted_fh:
            for record in rm_fh:
                if "#" in record:
                    continue
                record = record.replace("McClintock-int", "McClintock")
                record = record.replace("POGON1", "pogo")
                columns = record.split("\t")
                attributes = columns[8]
                if columns[0] not in chromosomes:
                    continue
                # second space-separated token, quotes stripped, text after ":"
                te = attributes.split(" ")[1].replace('"', '').split(":")[1]
                columns[2] = te
                columns[8] = ";".join(["ID=" + te, "Name=" + te, "Alias=" + te])
                formatted_fh.write("\t".join(columns) + '\n')

        masked_fasta = work_dir + "/" + ref_name + ".masked"
        # the returned lines are not used here; the call is kept as in the original
        fix_fasta.fix_fasta_lines(masked_fasta, 80)

        mccutils.check_file_exists(formatted_ref_tes)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("ERROR...Failed to run repeatmasker on: ",
              reference,
              "with lib:",
              te_fasta,
              "check file formatting...exiting...",
              file=sys.stderr)
        sys.exit(1)

    return formatted_ref_tes
Exemple #24
0
def main():
    """Post-process TEPID insertion and deletion calls into BED files.

    Insertions and deletions are read from their BED files and given read
    support with the threshold from ``config``. The deletions are passed to
    ``get_non_absent_ref_tes`` and its result is appended to the insertions.
    The combined list is written as redundant and non-redundant BED files, or
    those files are only touched when the list is empty.
    """
    inputs = snakemake.input
    insertions_bed = inputs.insertions_bed
    deletions_bed = inputs.deletions_bed
    insertions_support = inputs.insertions_support
    deletions_support = inputs.deletions_support
    te_gff = inputs.te_gff
    te_taxonomy = inputs.te_taxonomy
    chromosomes = snakemake.params.chromosomes.split(",")

    sample_name = snakemake.params.sample_name
    out_dir = snakemake.params.out_dir

    mccutils.log("tepid", "running TEPID post processing")
    te_to_family = get_te_family_map(te_taxonomy)
    te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family)

    insertions = read_insertions(insertions_bed, te_to_family, sample_name,
                                 te_pos_to_family, chromosomes,
                                 reference=False)
    insertions = add_support(insertions, insertions_support,
                             threshold=config.READ_SUPPORT_THRESHOLD)

    deletions = read_insertions(deletions_bed, te_to_family, sample_name,
                                te_pos_to_family, chromosomes,
                                reference=True)
    deletions = add_support(deletions, deletions_support,
                            threshold=config.READ_SUPPORT_THRESHOLD)
    non_abs_ref_insertions = get_non_absent_ref_tes(
        deletions, te_gff, te_to_family, sample_name)

    insertions += non_abs_ref_insertions
    if len(insertions) == 0:
        for suffix in ("_tepid_redundant.bed", "_tepid_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        mccutils.make_redundant_bed(
            insertions, sample_name, out_dir, method="tepid")
        mccutils.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="tepid")

    mccutils.log("tepid", "TEPID post processing complete")
Exemple #25
0
def main():
    """Download, unpack, patch and install TEFLoN under ``<install>/tools/teflon``.

    The zip is fetched with ``mccutils.download`` (up to 3 attempts, checked
    against ``snakemake.params.md5``); on failure the process exits with
    status 1. The unpacked directory is moved into the tools directory, its
    contents are moved into a fresh ``teflon`` directory, and two patches are
    applied before the leftovers are removed.
    """
    archive_dir = "TEFLoN-9eca0152f3dd9dc6c44787a30d590f7e321b442c"
    install_root = snakemake.config['paths']['install']
    install_path = install_root + "/tools/"
    teflon_dir = install_path + "teflon"

    def run(command):
        mccutils.run_command(command, log=snakemake.params.log)

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(snakemake.params.url,
                                         snakemake.params.zipfile,
                                         md5=snakemake.params.md5,
                                         max_attempts=3)

    if not download_success:
        print("teflon download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # unpack, clearing any directory left over from an earlier attempt
    mccutils.remove(install_root + archive_dir)
    run(["unzip", snakemake.params.zipfile])

    mccutils.remove(install_path + archive_dir)
    run(["mv", install_root + archive_dir, install_path])

    mccutils.remove(teflon_dir)
    mccutils.mkdir(teflon_dir)

    for entry in os.listdir(install_path + archive_dir):
        run(["mv", install_path + archive_dir + "/" + entry, teflon_dir])

    run([
        "patch", "-i", snakemake.params.pseudo2refConvert_patch,
        install_path + "teflon/teflon_scripts/pseudo2refConvert.py"
    ])
    run([
        "patch", "-i", snakemake.params.teflon_patch,
        install_path + "teflon/teflon.v0.4.py"
    ])

    mccutils.remove(install_path + archive_dir)
    mccutils.remove(snakemake.params.zipfile)
Exemple #26
0
def main():
    """Run ngs_te_mapper via its R script and copy the insertions BED to the output.

    Every input is read from the global ``snakemake`` object: the consensus
    and reference FASTA files and the two FASTQ paths from ``input``; the log
    path, script directory, output directory and ``raw_fq2`` marker from
    ``params``; the thread count from ``threads``; and the destination BED
    from ``output[0]``.

    Steps, in order:
      1. append the input paths to the log file,
      2. delete every entry currently inside the output directory,
      3. run ``ngs_te_mapper.R`` with ``Rscript --vanilla``,
      4. copy the file in ``<out_dir>/bed_tsd/`` whose name contains
         ``insertions.bed`` to ``output[0]``,
      5. remove ``<out_dir>/aligned_te/``.
    """
    inputs = snakemake.input
    te_consensus = inputs.consensus_fasta
    genome = inputs.reference_fasta
    reads1 = inputs.fastq1
    reads2 = inputs.fastq2

    params = snakemake.params
    log = params.log

    # Record which inputs this run used; one write per entry.
    logged_inputs = (
        ("consensus fasta: ", te_consensus),
        ("reference fasta: ", genome),
        ("fastq1: ", reads1),
        ("fastq2: ", reads2),
    )
    with open(log, "a") as log_handle:
        for label, value in logged_inputs:
            log_handle.write(label + value + "\n")

    threads = snakemake.threads
    sample_name = params.sample_name  # read from params but not used below
    script_dir = params.script_dir
    out_dir = params.out_dir
    out_bed = snakemake.output[0]

    # Clear leftovers from earlier runs so stale intermediates are not reused.
    for leftover in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + leftover)

    # The string "None" in raw_fq2 marks a single-end run.
    paired = params.raw_fq2 != "None"

    mapper_script = script_dir + "/ngs_te_mapper.R"
    argv = ["Rscript", "--vanilla", mapper_script]
    argv += [
        "genome=" + genome,
        "teFile=" + te_consensus,
        "tsd=" + str(config.MAX_TSD),
        "thread=" + str(threads),
        "output=" + out_dir,
        "sourceCodeFolder=" + script_dir,
    ]
    # Paired-end reads are handed over as one ';'-separated value.
    argv.append(("sample=" + reads1 + ";" + reads2) if paired
                else ("sample=" + reads1))

    mccutils.log("ngs_te_mapper", "running ngs_te_mapper", log=log)
    mccutils.run_command(argv, log=log)
    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete", log=log)

    # When several names match, the last one listed wins; when none match,
    # the source path stays an empty string.
    bed_dir = out_dir + "/bed_tsd/"
    candidates = [name for name in os.listdir(bed_dir)
                  if "insertions.bed" in name]
    raw_bed = bed_dir + candidates[-1] if candidates else ""

    mccutils.run_command(["cp", raw_bed, out_bed])

    mccutils.remove(out_dir + "/aligned_te/")

    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete")
Exemple #27
0
def filter_jitterbug(script_dir,
                     jitterbug_gff,
                     filter_config,
                     sample_name,
                     filtered_gff,
                     log=None):
    """Run the jitterbug result-filtering script on a GFF file.

    Executes ``tools/jitterbug_filter_results_func.py`` with ``-g``, ``-c``
    and ``-o`` options.

    Args:
        script_dir: Prefix prepended directly to the script's relative path;
            no path separator is inserted between the two.
        jitterbug_gff: Value passed with ``-g``.
        filter_config: Value passed with ``-c``.
        sample_name: Accepted for interface compatibility; not used here.
        filtered_gff: Value passed with ``-o``.
        log: Forwarded to ``mccutils.run_command``.
    """
    filter_script = script_dir + "tools/jitterbug_filter_results_func.py"

    argv = [filter_script]
    argv += ["-g", jitterbug_gff]
    argv += ["-c", filter_config]
    argv += ["-o", filtered_gff]

    mccutils.run_command(argv, log=log)
def popoolationte2_frequency(jar, ppileup, signatures, out, log=None):
    """Run the ``frequency`` subcommand of the given jar and return its output path.

    Args:
        jar: Path handed to ``java -jar``.
        ppileup: Value passed with ``--ppileup``.
        signatures: Value passed with ``--signature``.
        out: Directory prefix; the result file is
            ``<out>/output.stranded.signatures.freq``.
        log: Forwarded to ``mccutils.log`` and ``mccutils.run_command``.

    Returns:
        The path given to ``--output``.
    """
    mccutils.log(
        "popoolationte2",
        "estimating frequencies for signatures of TE insertions",
        log=log,
    )

    freq_signatures = out + "/output.stranded.signatures.freq"

    java_cmd = ["java", "-jar", jar, "frequency"]
    java_cmd.extend(["--ppileup", ppileup])
    java_cmd.extend(["--signature", signatures])
    java_cmd.extend(["--output", freq_signatures])
    mccutils.run_command(java_cmd, log=log)

    return freq_signatures
Exemple #29
0
def prep_annotations(script_dir, out_dir, ref_bed, taxonomy, consensus, reference, log=None):
    """Run ``teflon_prep_annotation.py`` with the given annotation inputs.

    Args:
        script_dir: Prefix prepended directly to ``teflon_prep_annotation.py``;
            no path separator is inserted between the two.
        out_dir: Value passed with ``-wd``.
        ref_bed: Value passed with ``-a``.
        taxonomy: Value passed with ``-t``.
        consensus: Value passed with ``-f``.
        reference: Value passed with ``-g``.
        log: Forwarded to ``mccutils.run_command``.

    The ``-p`` option is always given the fixed value ``teflon``.
    """
    prep_script = script_dir + "teflon_prep_annotation.py"
    options = (
        ("-wd", out_dir),
        ("-a", ref_bed),
        ("-t", taxonomy),
        ("-f", consensus),
        ("-g", reference),
        ("-p", "teflon"),
    )

    argv = ["python", prep_script]
    for flag, value in options:
        argv.extend((flag, value))

    mccutils.run_command(argv, log=log)
Exemple #30
0
def main():
    reference_te_gff = snakemake.input.reference_tes
    bam = snakemake.input.bam

    out_dir = snakemake.params.out_dir
    script_dir = snakemake.params.script_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    status_log = snakemake.params.status_log
    threads = snakemake.threads

    out = snakemake.output.out

    mccutils.log("jitterbug", "Running jitterbug", log=log)

    try:
        out_gff, config_file = run_jitterbug(
            script_dir,
            bam,
            reference_te_gff,
            sample_name,
            out_dir,
            minmapq=config.RUN['MINMAPQ'],
            min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'],
            threads=threads,
            log=log)

        config_file = make_config(
            config_file,
            out_dir,
            cluster_size=config.FILTER["CLUSTER_SIZE"],
            span=config.FILTER['SPAN'],
            int_size=config.FILTER['INT_SIZE'],
            softclipped=config.FILTER['SOFTCLIPPED'],
            pick_consistent=config.FILTER['PICK_CONSISTENT'])
        filter_jitterbug(script_dir,
                         out_gff,
                         config_file,
                         sample_name,
                         out,
                         log=log)

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        mccutils.log("Jitterbug", "Jitterbug run failed")
        with open(status_log, "w") as l:
            l.write("FAILED\n")

        mccutils.run_command(["touch", out])