def main():
    """Create the coverage fasta for the current snakemake job.

    When the ``coverage_fasta`` param is the string "None", the output file
    is only touched. Otherwise the fasta is passed through
    ``fix_fasta.fix_fasta_lines`` (with 80 as its second argument) and the
    result is written with ``write_fasta``.

    On any exception the traceback and a hint are printed to stderr, the
    first three snakemake outputs are removed and the process exits with
    status 1.
    """
    mccutils.log("processing", "making coverage fasta")

    try:
        line_length = 80
        if snakemake.params.coverage_fasta != "None":
            source_fasta = snakemake.params.coverage_fasta
            fixed_lines = fix_fasta.fix_fasta_lines(source_fasta, line_length)
            write_fasta(fixed_lines, snakemake.output.coverage_fasta)
        else:
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...failed to create coverage fasta, check the formatting of :",
            snakemake.params.coverage_fasta,
            file=sys.stderr,
        )
        # Drop the (possibly partial) outputs before aborting.
        for index in range(3):
            mccutils.remove(snakemake.output[index])
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
def main():
    """Turn PopoolationTE2 predictions into redundant/nonredundant BED files.

    Inputs and params come from the global ``snakemake`` object. Reference
    TEs are collected with ``get_ref_tes`` and predictions are parsed with
    ``read_insertions``. When no insertion is returned, both BED files are
    created empty with ``touch``.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")

    inputs = snakemake.input
    params = snakemake.params
    te_predictions = inputs.popoolationte2_out
    te_gff = inputs.te_gff
    taxonomy = inputs.taxonomy
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    log = params.log  # read from the rule params; not used below

    ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
    insertions = read_insertions(
        te_predictions,
        ref_tes,
        chromosomes,
        sample_name,
        both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT,
        support_threshold=config.FREQUENCY_THRESHOLD,
    )

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_popoolationte2_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
def run_retroseq(bam, bed_locations, ref_fasta, script_dir, sample_name, out_dir, params, log=None):
    """Run the RetroSeq discovery step followed by the call step.

    Both steps invoke ``retroseq.pl`` from ``script_dir`` through perl and
    share the ``-depth``, ``-reads`` and ``-q`` values taken from ``params``
    (keys ``depth``, ``reads`` and ``q``). Discovery output is written to
    ``<out_dir>/<sample_name>.discovery`` and then used as the input of the
    call step, which writes ``<out_dir>/<sample_name>.call``.
    """
    retroseq_script = script_dir + "/retroseq.pl"
    sample_prefix = out_dir + "/" + sample_name
    threshold_args = [
        "-depth", str(params["depth"]),
        "-reads", str(params['reads']),
        "-q", str(params['q']),
    ]

    discovery_out = sample_prefix + ".discovery"
    discover_cmd = [
        "perl", retroseq_script, "-discover",
        "-bam", bam,
        "-refTEs", bed_locations,
        "-output", discovery_out,
    ] + threshold_args
    mccutils.run_command(discover_cmd, log=log)

    call_out = sample_prefix + ".call"
    call_cmd = [
        "perl", retroseq_script, "-call",
        "-bam", bam,
        "-input", discovery_out,
        "-filter", bed_locations,
        "-ref", ref_fasta,
        "-output", call_out,
        "-orientate", "yes",
    ] + threshold_args
    mccutils.run_command(call_cmd, log=log)
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    """Filter, rename and sort ngs_te_mapper records into the final BED file.

    Each input line has ``;`` replaced by a tab and is split on tabs. A
    record is kept when column 8 (index 7), read as an int, exceeds
    ``min_read_cutoff`` and column 1 is in ``chromosomes``. Kept records are
    written to ``<out_dir>/unsorted.bed`` with a generated name column.

    If at least one record was kept, the file is sorted with
    ``bedtools sort`` and copied below a track header into
    ``<out_dir>/<sample_name>_ngs_te_mapper_nonredundant.bed``; otherwise
    that file is only touched. Temporary files are removed afterwards.
    """
    unsorted_bed = out_dir+"/unsorted.bed"
    final_bed = out_dir+"/"+sample_name+"_ngs_te_mapper_nonredundant.bed"

    kept = 0
    with open(unsorted_bed, "w") as raw_out, open(bed, "r") as raw_in:
        # The 1-based input line number becomes part of the record name.
        for line_number, record in enumerate(raw_in, start=1):
            fields = record.replace(";", "\t").split("\t")
            passes = int(fields[7]) > min_read_cutoff and fields[0] in chromosomes
            if not passes:
                continue

            kept += 1
            record_name = (
                fields[5]+"_"+fields[8].replace("\n", "")+"_"+sample_name
                +"_ngs_te_mapper_sr_"+str(line_number)
            )
            columns = [fields[0], fields[1], fields[2], record_name, "0", fields[4]]
            raw_out.write("\t".join(columns)+"\n")

    if kept >= 1:
        sorted_bed = out_dir+"/sorted.bed"
        mccutils.run_command_stdout(["bedtools", "sort", "-i", unsorted_bed], sorted_bed, log=log)

        with open(final_bed, "w") as final_out:
            final_out.write(
                'track name="'+sample_name+'_ngs_te_mapper" description="'+sample_name+'_ngs_te_mapper"\n'
            )
            with open(sorted_bed, "r") as sorted_in:
                final_out.writelines(sorted_in)

        mccutils.remove(sorted_bed)
    else:
        mccutils.run_command(["touch", final_bed])

    mccutils.remove(unsorted_bed)
def sort_bam(bam, sorted_bam, threads=1, log=None):
    """Sort ``bam`` with ``samtools sort`` into ``sorted_bam``.

    ``threads`` is passed to samtools via ``-@``. Returns ``sorted_bam``.
    """
    mccutils.log("popoolationte2", "sorting BAM", log=log)
    sort_command = ["samtools", "sort", "-@", str(threads), bam, "-o", sorted_bam]
    mccutils.run_command(sort_command, log=log)
    return sorted_bam
def main():
    """Post-process RetroSeq output into BED and VCF files.

    Predictions are parsed with ``read_insertions`` using the thresholds in
    ``config``. With at least one insertion, the redundant BED, nonredundant
    BED and VCF are written through the ``output`` module; otherwise both
    BED files are created empty with ``touch``.
    """
    mccutils.log("retroseq", "processing RetroSeq results")

    retroseq_out = snakemake.input.retroseq_out
    reference_fasta = snakemake.input.reference_fasta
    params = snakemake.params
    out_dir = params.out_dir
    ref_name = params.ref_name  # read from the rule params; not used below
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        retroseq_out,
        sample_name,
        chromosomes,
        support_threshold=config.READ_SUPPORT_THRESHOLD,
        breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD,
    )

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_retroseq_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="retroseq")
        insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="retroseq")
        output.write_vcf(insertions, reference_fasta, sample_name, "retroseq", out_dir)

    mccutils.log("retroseq", "RetroSeq post processing complete")
def main():
    """Produce the cleaned reference fasta and its augmented counterpart.

    The reference is passed through ``fix_fasta_lines`` and
    ``mccutils.replace_special_chars_fasta`` using temporary paths under
    ``<mcc_out>/tmp``. If the ``augment`` param is not the string "None",
    the augment fasta gets the same treatment and is combined with the
    reference by ``augment_reference``; otherwise the augmented reference is
    the cleaned reference itself. Both results are copied to the snakemake
    outputs.
    """
    reference = snakemake.input.ref
    params = snakemake.params
    augment = params.augment
    mcc_out = params.mcc_out
    run_id = params.run_id
    log = params.log  # read from the rule params; not used below
    out_ref = snakemake.output.ref
    out_aug_ref = snakemake.output.aug_ref

    tmp_dir = mcc_out+"/tmp"
    if not os.path.exists(tmp_dir):
        mccutils.mkdir(tmp_dir)

    mccutils.log("processing","making reference fasta")

    # Every intermediate file shares this prefix and differs by a digit suffix.
    tmp = mcc_out+"/tmp/"+str(run_id)+"reference.tmp"
    reference = mccutils.replace_special_chars_fasta(fix_fasta_lines(reference, tmp), tmp+"1")

    if augment == "None":
        augmented_reference = reference
    else:
        augment = mccutils.replace_special_chars_fasta(fix_fasta_lines(augment, tmp+"2"), tmp+"3")
        augmented_reference = augment_reference(reference, augment, tmp+"4")

    mccutils.run_command(["cp", reference, out_ref])
    mccutils.run_command(["cp", augmented_reference, out_aug_ref])

    mccutils.log("processing","reference fasta created")
def main():
    """Post-process RelocaTE output into redundant/nonredundant BED files.

    Insertions are read from the RelocaTE GFF with ``get_insertions`` using
    the thresholds in ``config``, then passed through
    ``set_ref_orientations`` with the TE GFF. When no insertion remains,
    both BED files are created empty with ``touch``.
    """
    relocate_gff = snakemake.input.relocate_gff
    te_gff = snakemake.input.te_gff
    params = snakemake.params
    out_dir = params.out_dir
    log = params.log  # read from the rule params; not used below
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate", "processing RelocaTE results")

    insertions = get_insertions(
        relocate_gff,
        sample_name,
        chromosomes,
        ref_l_threshold=config.REF_LEFT_THRESHOLD,
        ref_r_threshold=config.REF_RIGHT_THRESHOLD,
        nonref_l_threshold=config.NONREF_LEFT_THRESHOLD,
        nonref_r_threshold=config.NONREF_RIGHT_THRESHOLD,
    )
    insertions = set_ref_orientations(insertions, te_gff)

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_relocate_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("relocate", "RelocaTE postprocessing complete")
def run_retroseq(bam, bed_locations, ref_fasta, script_dir, sample_name, out_dir, params, log=None):
    """Run the RetroSeq discovery step followed by the call step.

    Every key of ``params`` is appended to both command lines as given,
    followed by its value converted with ``str``. Discovery output goes to
    ``<out_dir>/<sample_name>.discovery`` and is then the input of the call
    step, which writes ``<out_dir>/<sample_name>.call``.
    """
    retroseq_script = script_dir+"/retroseq.pl"

    # Flatten the mapping into [key, value, key, value, ...] once.
    extra_args = []
    for flag in params.keys():
        extra_args.extend([flag, str(params[flag])])

    discovery_out = out_dir+"/"+sample_name+".discovery"
    discover_cmd = [
        "perl", retroseq_script, "-discover",
        "-bam", bam,
        "-refTEs", bed_locations,
        "-output", discovery_out,
    ]
    mccutils.run_command(discover_cmd + extra_args, log=log)

    call_out = out_dir+"/"+sample_name+".call"
    call_cmd = [
        "perl", retroseq_script, "-call",
        "-bam", bam,
        "-input", discovery_out,
        "-filter", bed_locations,
        "-ref", ref_fasta,
        "-output", call_out,
        "-orientate", "yes",
    ]
    mccutils.run_command(call_cmd + extra_args, log=log)
def map_reads(fq1, fq2, ref_name, median_insert_size, out, threads=1, paired=True, log=None):
    """Map reads with ``tepid-map`` (paired) or ``tepid-map-se`` (single-end).

    The working directory is changed to ``out`` before the mapper runs, and
    it is not restored afterwards. The index arguments are built from
    ``<out>/<ref_name>``.

    Args:
        fq1: first (or only) fastq file.
        fq2: second fastq file; only used when ``paired`` is true.
        ref_name: reference name, used for the index prefix and ``-n``.
        median_insert_size: value for ``-s``; only used when ``paired`` is
            true. It may be a number or a string.
        out: output directory.
        threads: value for ``-p``.
        paired: selects the paired-end or single-end mapper.
        log: forwarded to ``mccutils.run_command``.

    Returns:
        Tuple ``(bam, split_bam)`` with the paths ``<out>/<ref_name>.bam``
        and ``<out>/<ref_name>.split.bam``; both are passed to
        ``mccutils.check_file_exists`` first.
    """
    os.chdir(out)

    index_prefix = out + "/" + ref_name
    yara_index = index_prefix + ".X15_01_65525S"

    if paired:
        command = [
            "tepid-map",
            "-x", index_prefix,
            "-y", yara_index,
            "-p", str(threads),
            # Converted like ``threads`` so a numeric insert size does not
            # end up as a non-string entry in the argument list.
            "-s", str(median_insert_size),
            "-n", ref_name,
            "-1", fq1,
            "-2", fq2,
        ]
    else:
        command = [
            "tepid-map-se",
            "-x", index_prefix,
            "-y", yara_index,
            "-p", str(threads),
            "-n", ref_name,
            "-q", fq1,
        ]

    mccutils.run_command(command, log=log)

    bam = index_prefix + ".bam"
    split_bam = index_prefix + ".split.bam"
    mccutils.check_file_exists(bam)
    mccutils.check_file_exists(split_bam)

    return bam, split_bam
def run_repeatmasker(reference, ref_name, te_seqs, threads, log, outfile, outdir):
    """Run RepeatMasker on ``reference`` and move its ``.out`` file to ``outfile``.

    RepeatMasker runs inside a freshly recreated ``<outdir>/tmp/repeatmasker``
    directory, with ``te_seqs`` as the library. Afterwards the working
    directory is set to ``outdir``. The result file is a directory entry
    whose name ends in ``fasta.out``; if several match, the last one listed
    is used. The process exits through ``sys.exit`` with a message when
    none is found.
    """
    tmp_dir = outdir + "/tmp/repeatmasker"
    mccutils.remove(tmp_dir)
    mccutils.mkdir(tmp_dir)
    os.chdir(tmp_dir)

    masker_cmd = [
        "RepeatMasker",
        "-pa", str(threads),
        "-lib", te_seqs,
        "-dir", tmp_dir,
        "-s", "-nolow", "-no_is",
        reference,
    ]
    mccutils.run_command(masker_cmd, log=log)
    os.chdir(outdir)

    matches = [entry for entry in os.listdir(tmp_dir) if entry.endswith("fasta.out")]
    if not matches:
        sys.exit("can't find Repeatmasker output in:" + tmp_dir + "\n")

    rm_out = tmp_dir + "/" + matches[-1]
    mccutils.run_command(["mv", rm_out, outfile])
def main():
    """Post-process PopoolationTE output into redundant/nonredundant BED files.

    Predictions are parsed with ``read_insertions`` using the settings in
    ``config``. The BED files are written by the ``mccutils`` helpers with
    ``method="popoolationte"``. When no insertion is returned, both files
    are created empty with ``touch``.
    """
    mccutils.log("popoolationte", "processing PopoolationTE results")

    popoolationte_out = snakemake.input.popoolationte_out
    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name
    log = params.log  # read from the rule params; not used below
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        popoolationte_out,
        sample_name,
        chromosomes,
        require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT,
        percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD,
    )

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_popoolationte_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        insertions = mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte")
        mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte")

    mccutils.log("popoolationte", "PopoolationTE postprocessing complete")
def main():
    """Post-process TE-Locate output into redundant/nonredundant BED files.

    Raw predictions are parsed with ``read_insertions`` using
    ``config.READ_PAIR_SUPPORT_THRESHOLD`` and then passed through
    ``filter_by_reference`` with the TE GFF. When nothing remains, both BED
    files are created empty with ``touch``.
    """
    mccutils.log("te-locate", "processing TE-Locate results")

    telocate_raw = snakemake.input.telocate_raw
    te_gff = snakemake.input.te_gff
    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = filter_by_reference(
        read_insertions(
            telocate_raw,
            sample_name,
            chromosomes,
            rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD,
        ),
        te_gff,
    )

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_telocate_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("te-locate", "TE-Locate post processing complete")
def main():
    """Download, unpack, patch and install TEFLoN under ``<install>/tools/teflon``.

    Steps, in order: download the zip (up to 3 attempts, md5-checked),
    unzip it, move the unpacked directory into the tools directory, move
    its contents into a fresh ``teflon`` directory, apply the two patches,
    clean up the unpacked directory and the zip, and finally write the md5
    param to ``version.log``. The process exits with status 1 when the
    download fails.
    """
    install_root = snakemake.config['paths']['install']
    install_path = install_root + "/tools/"
    release_dir = "TEFLoN-3e2d67886b70644fd1f7d79263b3c8dbed639e46"

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(
        snakemake.params.url,
        snakemake.params.zipfile,
        md5=snakemake.params.md5,
        max_attempts=3,
    )
    if not download_success:
        print("teflon download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # NOTE(review): this path joins the install root and the directory name
    # without a separator, exactly as the code has always done; presumably
    # the configured install path ends with "/" - confirm.
    unpacked = install_root + release_dir
    mccutils.remove(unpacked)
    mccutils.run_command(["unzip", snakemake.params.zipfile], log=snakemake.params.log)

    staged = install_path + release_dir
    mccutils.remove(staged)
    mccutils.run_command(["mv", unpacked, install_path], log=snakemake.params.log)

    teflon_dir = install_path + "teflon"
    mccutils.remove(teflon_dir)
    mccutils.mkdir(teflon_dir)
    for entry in os.listdir(staged):
        mccutils.run_command(["mv", staged + "/" + entry, teflon_dir], log=snakemake.params.log)

    mccutils.run_command(
        [
            "patch", "-i", snakemake.params.pseudo2refConvert_patch,
            install_path + "teflon/teflon_scripts/pseudo2refConvert.py",
        ],
        log=snakemake.params.log,
    )
    mccutils.run_command(
        [
            "patch", "-i", snakemake.params.teflon_patch,
            install_path + "teflon/teflon.v0.4.py",
        ],
        log=snakemake.params.log,
    )

    mccutils.remove(staged)
    mccutils.remove(snakemake.params.zipfile)

    # write version to file
    version_log = snakemake.config['paths']['install'] + "/tools/teflon/version.log"
    with open(version_log, "w") as version:
        version.write(snakemake.params.md5)
def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, flags=None):
    """Run ``trim_galore`` and return the path(s) of the trimmed fastq file(s).

    Output is written to ``<out>/results/trimgalore``. Results are located
    by scanning that directory for names containing ``_trimmed.fq``
    (single-end) or ``_val_1.fq`` / ``_val_2.fq`` (paired-end). If several
    names match, the last one listed is used.

    Args:
        fq1: first (or only) fastq file.
        run_id: accepted for interface compatibility; not used here.
        log: forwarded to ``mccutils.run_command``.
        out: base output directory.
        fq2: second fastq file, or ``None`` for single-end data.
        cores: value passed to ``-j``.
        flags: optional iterable of extra command-line flags placed right
            after ``trim_galore``. ``None`` (the default) means no extras.

    Returns:
        The trimmed fastq path for single-end input, or a tuple
        ``(outfq1, outfq2)`` for paired-end input. Each path is passed to
        ``mccutils.check_file_exists`` before returning.
    """
    # A ``None`` default avoids sharing one mutable list object between calls.
    extra_flags = [] if flags is None else list(flags)
    results_dir = out+"/results/trimgalore"

    mccutils.mkdir(out+"/results/")
    command = ['trim_galore'] + extra_flags + ["-j", str(cores), "-o", results_dir]
    if fq2 is None:
        command.append(fq1)
    else:
        command += [fq1, fq2]

    mccutils.run_command(command, log=log)

    if fq2 is None:
        outfq = ""
        for f in os.listdir(results_dir):
            if "_trimmed.fq" in f:
                outfq = results_dir+"/"+f

        mccutils.check_file_exists(outfq)
        return outfq

    outfq1 = ""
    outfq2 = ""
    for f in os.listdir(results_dir):
        if "_val_1.fq" in f:
            outfq1 = results_dir+"/"+f
        elif "_val_2.fq" in f:
            outfq2 = results_dir+"/"+f

    mccutils.check_file_exists(outfq1)
    mccutils.check_file_exists(outfq2)
    return outfq1, outfq2
def combine_alignments(sam1, sam2, fq1, fq2, script_path, out, log=None):
    """Combine two SAM files with ``samro.pl`` and return the combined path.

    The script is located at ``script_path + "samro.pl"`` and the result is
    written to ``out + "combined.sam"``. Both are plain concatenations, so
    the caller supplies any trailing separator.
    """
    out_sam = out + "combined.sam"
    samro_cmd = [
        "perl", script_path + "samro.pl",
        "--sam1", sam1,
        "--sam2", sam2,
        "--fq1", fq1,
        "--fq2", fq2,
        "--output", out_sam,
    ]
    mccutils.run_command(samro_cmd, log=log)
    return out_sam
def main():
    """Convert the reference fasta (first input) to 2bit (first output).

    Runs ``faToTwoBit`` through ``mccutils.run_command``.
    """
    log = snakemake.params.log
    mccutils.log("processing", "creating 2bit file from reference genome fasta", log=log)

    fasta_in = snakemake.input[0]
    twobit_out = snakemake.output[0]
    mccutils.run_command(["faToTwoBit", fasta_in, twobit_out], log=log)

    mccutils.log("processing", "reference 2bit file created")
def main():
    """Prepare the BAM input for PopoolationTE2.

    The output directory is emptied first. The reference is then indexed,
    both fastq files are reformatted and mapped separately, and the two SAM
    files are merged into the output BAM by ``sam_to_bam``. The status log
    receives "COMPLETED" on success.

    On any exception the traceback goes to stderr and is appended to the
    log, the status log receives "FAILED", and the output BAM is touched.
    """
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")

    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    jar = snakemake.params.jar
    log = snakemake.params.log
    out_dir = snakemake.params.out_dir
    threads = snakemake.threads
    status_log = snakemake.params.status_log

    try:
        # ensures intermediate files from previous runs are removed
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        mccutils.mkdir(out_dir + "/tmp")
        index_fasta(ref_fasta, log=log)

        fq1 = format_fastq(fq1, out_dir + "/reads_1.fastq", log=log)
        fq2 = format_fastq(fq2, out_dir + "/reads_2.fastq", log=log)

        sam1 = map_reads(ref_fasta, fq1, out_dir + "/mapped_1.sam", threads=threads, log=log)
        sam2 = map_reads(ref_fasta, fq2, out_dir + "/mapped_2.sam", threads=threads, log=log)

        bam = sam_to_bam(
            jar, fq1, fq2, sam1, sam2, snakemake.output.bam, out_dir,
            threads=threads, log=log,
        )

        mccutils.remove(out_dir + "/tmp")
        mccutils.check_file_exists(snakemake.output.bam)

        with open(status_log, "w") as status:
            status.write("COMPLETED\n")

        mccutils.log("popoolationte2", "PopoolationTE2 preprocessing complete")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)

        mccutils.log("popoolationte2", "popoolationte2 preprocessing failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # Leave an (empty) output in place for the failed run.
        mccutils.run_command(["touch", snakemake.output.bam])
def make_local_css_js_copies(css_dir, js_dir, out_dir):
    """Copy every file from ``css_dir`` and ``js_dir`` into ``out_dir``.

    Creates ``<out_dir>/html/``, ``<out_dir>/css/`` and ``<out_dir>/js/``.
    The css and js entries are copied one by one with ``cp``.
    """
    mccutils.mkdir(out_dir + "/html/")

    for source_dir, subdir in ((css_dir, "css"), (js_dir, "js")):
        target_dir = out_dir + "/" + subdir + "/"
        mccutils.mkdir(target_dir)
        for entry in os.listdir(source_dir):
            mccutils.run_command(["cp", source_dir + "/" + entry, target_dir])
def sam_to_bam(jar, fq1, fq2, sam1, sam2, bam, out_dir, threads=1, log=None):
    """Merge two single-end alignments into one sorted BAM via the jar's ``se2pe``.

    Java's temporary directory is pointed at ``<out_dir>/tmp``. ``threads``
    is accepted but not passed to the command. Returns ``bam``.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)

    se2pe_cmd = [
        "java", "-Djava.io.tmpdir=" + out_dir + "/tmp",
        "-jar", jar, "se2pe",
        "--fastq1", fq1,
        "--fastq2", fq2,
        "--bam1", sam1,
        "--bam2", sam2,
        "--sort",
        "--output", bam,
    ]
    mccutils.run_command(se2pe_cmd, log=log)

    return bam
def main():
    """Post-process RelocaTE2 output into redundant/nonredundant BED files.

    Reference and non-reference insertions are read from their GFFs with
    ``get_insertions``, each with its own set of thresholds from ``config``.
    Reference insertions are passed through ``fix_ref_te_names`` before the
    two lists are concatenated. When the combined list is empty, both BED
    files are created empty with ``touch``.
    """
    nonref_gff = snakemake.input.nonref_gff
    ref_gff = snakemake.input.ref_gff
    rm_out = snakemake.input.rm_out
    params = snakemake.params
    log = params.log  # read from the rule params; not used below
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate2", "processing RelocaTE2 results")

    ref_insertions = get_insertions(
        ref_gff,
        sample_name,
        chromosomes,
        insert_type="ref",
        l_support_threshold=config.REF_LEFT_SUPPORT_THRESHOLD,
        r_support_threshold=config.REF_RIGHT_SUPPORT_THRESHOLD,
        l_junction_threshold=config.REF_LEFT_JUNCTION_THRESHOLD,
        r_junction_threshold=config.REF_RIGHT_JUNCTION_THRESHOLD,
    )
    nonref_insertions = get_insertions(
        nonref_gff,
        sample_name,
        chromosomes,
        insert_type="nonref",
        l_support_threshold=config.NONREF_LEFT_SUPPORT_THRESHOLD,
        r_support_threshold=config.NONREF_RIGHT_SUPPORT_THRESHOLD,
        l_junction_threshold=config.NONREF_LEFT_JUNCTION_THRESHOLD,
        r_junction_threshold=config.NONREF_RIGHT_JUNCTION_THRESHOLD,
    )

    ref_insertions = fix_ref_te_names(ref_insertions, rm_out, sample_name)
    all_insertions = ref_insertions + nonref_insertions

    if len(all_insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_relocate2_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        all_insertions = mccutils.make_redundant_bed(all_insertions, sample_name, out_dir, method="relocate2")
        mccutils.make_nonredundant_bed(all_insertions, sample_name, out_dir, method="relocate2")

    mccutils.log("relocate2", "RelocaTE2 postprocessing complete")
def main():
    """Run the PoPoolationTE workflow for the current snakemake job.

    The input paths are appended to the log first. Read length and median
    insert size are then measured, and the maximum distance is computed as
    ``int(median_insert_size * 3) + read_length``. The TE GFF is converted
    to a known-insert file and ``run_popoolationte`` is called with the
    settings from ``config``. Afterwards the first output is touched and
    the SAM and both fastq inputs are removed.
    """
    mccutils.log("popoolationte", "running PopoolationTE")

    inputs = snakemake.input
    ref_fasta = inputs.ref_fasta
    taxonomy = inputs.taxonomy
    te_gff = inputs.te_gff
    fq1 = inputs.fq1
    fq2 = inputs.fq2
    sam = inputs.sam
    log = snakemake.params.log

    with open(log, "a") as log_handle:
        log_handle.write("reference fasta: " + ref_fasta + "\n")
        log_handle.write("Taxonomy TSV: " + taxonomy + "\n")
        log_handle.write("TE GFF: " + te_gff + "\n")
        log_handle.write("fastq1: " + fq1 + '\n')
        log_handle.write("fastq2: " + fq2 + "\n")
        log_handle.write("SAM: " + sam + "\n")

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name  # read from the rule params; not used below
    script_dir = snakemake.params.script_dir

    mccutils.log("popoolationte", "getting read length")
    read_length = get_read_length(fq1, fq2)

    mccutils.log("popoolationte", "calculating median insert size")
    median_insert_size = get_median_insert_size(sam)
    max_dist = int(median_insert_size * 3) + read_length

    mccutils.log("popoolationte", "converting TE gff to PoPoolationTE known TE file")
    known_inserts = make_known_insert_file(te_gff, out_dir)

    mccutils.log("popoolationte", "running the PoPoolationTE workflow scripts")
    run_popoolationte(
        sam,
        ref_fasta,
        taxonomy,
        read_length,
        median_insert_size,
        max_dist,
        known_inserts,
        script_dir,
        out_dir,
        log=log,
        identify_min_count=config.IDENTIFY_TE_INSERTSITES["min-count"],
        identify_min_qual=config.IDENTIFY_TE_INSERTSITES["min-map-qual"],
        crosslink_site_shift=config.CROSSLINK_TE_SITES['single-site-shift'],
        update_te_inserts_site_shift=config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift'],
        estimate_polymorphism_min_qual=config.ESTIMATE_POLYMORPHISM['min-map-qual'],
        filter_min_count=config.FILTER['min-count'],
    )

    mccutils.run_command(["touch", snakemake.output[0]])

    # The intermediate inputs are removed once the workflow has finished.
    for consumed in (sam, fq1, fq2):
        mccutils.remove(consumed)
def repeat_mask(reference, te_fasta, chromosomes, procs, run_id, log, out):
    """Run RepeatMasker on ``reference`` and reformat its GFF output.

    RepeatMasker is executed inside ``<out>/tmp/repeatmasker_<run_id>`` with
    ``te_fasta`` as the library. Afterwards the working directory is set to
    ``out``. Non-comment lines of ``<reference basename>.out.gff`` whose
    first column is in ``chromosomes`` are rewritten so that column 3 holds
    the TE name and column 9 holds ``ID=..;Name=..;Alias=..``, and are
    written to ``<out>/tmp/<run_id>tmpreferenceTEs.gff``.

    On any exception the traceback and an error message are printed to
    stderr and the process exits with status 1.

    Returns:
        Path of the reformatted GFF file.
    """
    try:
        outdir = out + "/tmp/repeatmasker_" + run_id
        mccutils.mkdir(outdir)
        os.chdir(outdir)
        command = [
            "RepeatMasker", "-pa", str(procs), "-lib", te_fasta, "-dir", outdir,
            "-s", "-gff", "-nolow", "-no_is", reference
        ]
        mccutils.run_command(command, log=log)
        os.chdir(out)

        # RepeatMasker appears to override the custom database names during the ProcessRepeats
        # this step changes them back, more rules may be needed for other reference genomes
        ref_name = os.path.basename(reference)
        repeatmasker_gff = outdir + "/" + ref_name + ".out.gff"
        formatted_ref_tes = out + "/tmp/" + run_id + "tmpreferenceTEs.gff"
        with open(repeatmasker_gff, "r") as rmgff:
            with open(formatted_ref_tes, "w") as outgff:
                for line in rmgff:
                    # Lines containing "#" anywhere are skipped.
                    if "#" not in line:
                        line = line.replace("McClintock-int", "McClintock")
                        line = line.replace("POGON1", "pogo")
                        split_line = line.split("\t")
                        feats = split_line[8]
                        if split_line[0] in chromosomes:
                            # The TE name is the second space-separated token
                            # of column 9, with quotes stripped, taken after
                            # the ":".
                            te = feats.split(" ")[1]
                            te = te.replace('"', '').split(":")[1]
                            feats = ";".join(
                                ["ID=" + te, "Name=" + te, "Alias=" + te])
                            split_line[2] = te
                            split_line[8] = feats
                            line = "\t".join(split_line)
                            # NOTE(review): the write is placed inside the
                            # chromosome check because only the rebuilt line
                            # lacks a trailing newline - confirm against the
                            # original layout.
                            outgff.write(line + '\n')

        masked_fasta = outdir + "/" + ref_name + ".masked"
        # The return value is not used in this function.
        fasta_lines = fix_fasta.fix_fasta_lines(masked_fasta, 80)

        mccutils.check_file_exists(formatted_ref_tes)

    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("ERROR...Failed to run repeatmasker on: ",
              reference,
              "with lib:",
              te_fasta,
              "check file formatting...exiting...",
              file=sys.stderr)
        sys.exit(1)

    return formatted_ref_tes
def main():
    """Post-process TEPID output into redundant/nonredundant BED files.

    Insertions and deletions are read with ``read_insertions``, with
    ``reference=False`` and ``reference=True`` respectively. Each list is
    passed through ``add_support`` with ``config.READ_SUPPORT_THRESHOLD``.
    The result of ``get_non_absent_ref_tes`` for the deletions is appended
    to the insertions. When the final list is empty, both BED files are
    created empty with ``touch``.
    """
    inputs = snakemake.input
    insertions_bed = inputs.insertions_bed
    deletions_bed = inputs.deletions_bed
    insertions_support = inputs.insertions_support
    deletions_support = inputs.deletions_support
    te_gff = inputs.te_gff
    te_taxonomy = inputs.te_taxonomy
    chromosomes = snakemake.params.chromosomes.split(",")
    sample_name = snakemake.params.sample_name
    out_dir = snakemake.params.out_dir

    mccutils.log("tepid", "running TEPID post processing")

    te_to_family = get_te_family_map(te_taxonomy)
    te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family)

    insertions = read_insertions(
        insertions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes,
        reference=False,
    )
    insertions = add_support(insertions, insertions_support, threshold=config.READ_SUPPORT_THRESHOLD)

    deletions = read_insertions(
        deletions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes,
        reference=True,
    )
    deletions = add_support(deletions, deletions_support, threshold=config.READ_SUPPORT_THRESHOLD)

    non_abs_ref_insertions = get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name)
    insertions += non_abs_ref_insertions

    if len(insertions) == 0:
        bed_prefix = out_dir + "/" + sample_name + "_tepid_"
        for kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", bed_prefix + kind + ".bed"])
    else:
        mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="tepid")
        mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="tepid")

    mccutils.log("tepid", "TEPID post processing complete")
def main():
    """Download, unpack, patch and install TEFLoN under ``<install>/tools/teflon``.

    Steps, in order: download the zip (up to 3 attempts, md5-checked),
    unzip it, move the unpacked directory into the tools directory, move
    its contents into a fresh ``teflon`` directory, apply the two patches,
    and clean up the unpacked directory and the zip. The process exits with
    status 1 when the download fails.
    """
    install_root = snakemake.config['paths']['install']
    install_path = install_root + "/tools/"
    release_dir = "TEFLoN-9eca0152f3dd9dc6c44787a30d590f7e321b442c"

    mccutils.remove(snakemake.params.zipfile)
    download_success = mccutils.download(
        snakemake.params.url,
        snakemake.params.zipfile,
        md5=snakemake.params.md5,
        max_attempts=3,
    )
    if not download_success:
        print("teflon download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # NOTE(review): this path joins the install root and the directory name
    # without a separator, exactly as the code has always done; presumably
    # the configured install path ends with "/" - confirm.
    unpacked = install_root + release_dir
    mccutils.remove(unpacked)
    mccutils.run_command(["unzip", snakemake.params.zipfile], log=snakemake.params.log)

    staged = install_path + release_dir
    mccutils.remove(staged)
    mccutils.run_command(["mv", unpacked, install_path], log=snakemake.params.log)

    teflon_dir = install_path + "teflon"
    mccutils.remove(teflon_dir)
    mccutils.mkdir(teflon_dir)
    for entry in os.listdir(staged):
        mccutils.run_command(["mv", staged + "/" + entry, teflon_dir], log=snakemake.params.log)

    mccutils.run_command(
        [
            "patch", "-i", snakemake.params.pseudo2refConvert_patch,
            install_path + "teflon/teflon_scripts/pseudo2refConvert.py",
        ],
        log=snakemake.params.log,
    )
    mccutils.run_command(
        [
            "patch", "-i", snakemake.params.teflon_patch,
            install_path + "teflon/teflon.v0.4.py",
        ],
        log=snakemake.params.log,
    )

    mccutils.remove(staged)
    mccutils.remove(snakemake.params.zipfile)
def main():
    """Run the ngs_te_mapper R script and copy its insertions BED to the output.

    The input paths are appended to the log, the output directory is
    emptied, and ``ngs_te_mapper.R`` is run through ``Rscript``. The sample
    argument is ``fastq1;fastq2`` unless the ``raw_fq2`` param is the string
    "None". The entry of ``<out_dir>/bed_tsd/`` whose name contains
    ``insertions.bed`` (the last one listed, if several) is then copied to
    the first snakemake output, and ``<out_dir>/aligned_te/`` is removed.
    """
    consensus_fasta = snakemake.input.consensus_fasta
    reference_fasta = snakemake.input.reference_fasta
    fastq1 = snakemake.input.fastq1
    fastq2 = snakemake.input.fastq2
    log = snakemake.params.log

    with open(log, "a") as log_handle:
        log_handle.write("consensus fasta: " + consensus_fasta + "\n")
        log_handle.write("reference fasta: " + reference_fasta + "\n")
        log_handle.write("fastq1: " + fastq1 + "\n")
        log_handle.write("fastq2: " + fastq2 + "\n")

    threads = snakemake.threads
    sample_name = snakemake.params.sample_name  # read from the rule params; not used below
    script_dir = snakemake.params.script_dir
    out_dir = snakemake.params.out_dir
    out_bed = snakemake.output[0]

    # ensures intermediate files from previous runs are removed
    for leftover in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + leftover)

    is_paired = snakemake.params.raw_fq2 != "None"

    command = [
        'Rscript', "--vanilla", script_dir + "/ngs_te_mapper.R",
        "genome=" + reference_fasta,
        "teFile=" + consensus_fasta,
        "tsd=" + str(config.MAX_TSD),
        "thread=" + str(threads),
        "output=" + out_dir,
        "sourceCodeFolder=" + script_dir,
    ]
    sample_arg = "sample=" + fastq1
    if is_paired:
        sample_arg = sample_arg + ";" + fastq2
    command.append(sample_arg)

    mccutils.log("ngs_te_mapper", "running ngs_te_mapper", log=log)
    mccutils.run_command(command, log=log)
    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete", log=log)

    bed_dir = out_dir + "/bed_tsd/"
    matches = [entry for entry in os.listdir(bed_dir) if "insertions.bed" in entry]
    raw_bed = (bed_dir + matches[-1]) if matches else ""

    mccutils.run_command(["cp", raw_bed, out_bed])
    mccutils.remove(out_dir + "/aligned_te/")

    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete")
def filter_jitterbug(script_dir, jitterbug_gff, filter_config, sample_name, filtered_gff, log=None):
    """Filter a Jitterbug GFF with ``jitterbug_filter_results_func.py``.

    The script path is ``script_dir + "tools/jitterbug_filter_results_func.py"``
    (plain concatenation) and it is invoked directly, not through an
    interpreter. ``sample_name`` is accepted but not used.
    """
    filter_script = script_dir + "tools/jitterbug_filter_results_func.py"
    filter_cmd = [filter_script, "-g", jitterbug_gff, "-c", filter_config, "-o", filtered_gff]
    mccutils.run_command(filter_cmd, log=log)
def popoolationte2_frequency(jar, ppileup, signatures, out, log=None):
    """Run the jar's ``frequency`` subcommand and return its output path.

    The result is written to ``<out>/output.stranded.signatures.freq``.
    """
    mccutils.log("popoolationte2", "estimating frequencies for signatures of TE insertions", log=log)

    freq_signatures = out + "/output.stranded.signatures.freq"
    frequency_cmd = [
        "java", "-jar", jar, "frequency",
        "--ppileup", ppileup,
        "--signature", signatures,
        "--output", freq_signatures,
    ]
    mccutils.run_command(frequency_cmd, log=log)

    return freq_signatures
def prep_annotations(script_dir, out_dir, ref_bed, taxonomy, consensus, reference, log=None):
    """Run ``teflon_prep_annotation.py`` with the prefix ``teflon``.

    The script path is ``script_dir + "teflon_prep_annotation.py"`` (plain
    concatenation) and it is run through ``python``.
    """
    prep_script = script_dir+"teflon_prep_annotation.py"
    prep_cmd = [
        "python", prep_script,
        "-wd", out_dir,
        "-a", ref_bed,
        "-t", taxonomy,
        "-f", consensus,
        "-g", reference,
        "-p", "teflon",
    ]
    mccutils.run_command(prep_cmd, log=log)
def main():
    """Run Jitterbug and filter its predictions for the current snakemake job.

    ``run_jitterbug`` returns the raw GFF and a config file. ``make_config``
    then produces the filter config from the ``config.FILTER`` settings, and
    ``filter_jitterbug`` writes the filtered GFF to the ``out`` output.

    On any exception the traceback goes to stderr and is appended to the
    log, the status log receives "FAILED", and the output file is touched.
    """
    reference_te_gff = snakemake.input.reference_tes
    bam = snakemake.input.bam
    params = snakemake.params
    out_dir = params.out_dir
    script_dir = params.script_dir
    sample_name = params.sample_name
    log = params.log
    status_log = params.status_log
    threads = snakemake.threads
    out = snakemake.output.out

    mccutils.log("jitterbug", "Running jitterbug", log=log)

    try:
        out_gff, config_file = run_jitterbug(
            script_dir,
            bam,
            reference_te_gff,
            sample_name,
            out_dir,
            minmapq=config.RUN['MINMAPQ'],
            min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'],
            threads=threads,
            log=log,
        )

        config_file = make_config(
            config_file,
            out_dir,
            cluster_size=config.FILTER["CLUSTER_SIZE"],
            span=config.FILTER['SPAN'],
            int_size=config.FILTER['INT_SIZE'],
            softclipped=config.FILTER['SOFTCLIPPED'],
            pick_consistent=config.FILTER['PICK_CONSISTENT'],
        )

        filter_jitterbug(script_dir, out_gff, config_file, sample_name, out, log=log)

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)

        mccutils.log("Jitterbug", "Jitterbug run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # Leave an (empty) output in place for the failed run.
        mccutils.run_command(["touch", out])