Ejemplo n.º 1
0
def main():
    """Run the RetroSeq module using the Snakemake-provided inputs and params.

    Appends the main input paths to the log file, splits the consensus
    fasta per element, builds the consensus BED location file and finally
    launches RetroSeq with ``config.PARAMETERS``.
    """
    inputs = snakemake.input
    params = snakemake.params

    consensus_fasta = inputs.consensus_fasta
    bam = inputs.bam
    ref_fasta = inputs.ref_fasta
    ref_te_bed = inputs.ref_te_bed
    taxonomy = inputs.taxonomy
    log = params.log

    # Record the inputs in the log file, one "label: path" entry per line.
    logged_inputs = (
        ("consensus fasta: ", consensus_fasta),
        ("BAM: ", bam),
        ("reference fasta: ", ref_fasta),
        ("taxonomy TSV: ", taxonomy),
    )
    with open(log, "a") as log_handle:
        for label, path in logged_inputs:
            log_handle.write(label + path + "\n")

    script_dir = params.script_dir
    out_dir = params.out_dir
    ref_name = params.ref_name
    sample_name = params.sample_name

    mccutils.log("retroseq", "running RetroSeq", log=log)

    elements = split_consensus_fasta(consensus_fasta, ref_name, out_dir)
    bed_location_file = make_consensus_beds(
        elements, ref_name, ref_te_bed, taxonomy, out_dir
    )
    run_retroseq(
        bam,
        bed_location_file,
        ref_fasta,
        script_dir,
        sample_name,
        out_dir,
        config.PARAMETERS,
        log=log,
    )

    mccutils.log("retroseq", "RetroSeq complete")
Ejemplo n.º 2
0
def main():
    """Prepare the inputs TEFLoN needs and map the reads.

    Builds the reference BED from the TE GFF, writes the TEFLoN taxonomy
    file, prepares the annotations and then maps the paired fastq files.
    """
    mccutils.log("teflon", "setting up for TEFLoN")

    inputs = snakemake.input
    params = snakemake.params
    outputs = snakemake.output

    te_gff = inputs.te_gff
    taxonomy = inputs.taxonomy
    consensus = inputs.consensus
    reference_genome = inputs.reference_genome
    fq1 = inputs.fq1
    fq2 = inputs.fq2

    threads = snakemake.threads
    out_dir = params.out_dir
    script_dir = params.script_dir
    log = params.log

    ref_bed = outputs.ref_bed
    teflon_taxonomy = outputs.teflon_taxonomy

    make_reference_bed(te_gff, ref_bed)
    make_taxonomy_file(taxonomy, teflon_taxonomy)

    prep_annotations(
        script_dir,
        out_dir,
        ref_bed,
        teflon_taxonomy,
        consensus,
        reference_genome,
        log=log,
    )
    map_reads(out_dir, fq1, fq2, threads=threads, log=log)

    mccutils.log("teflon", "setup for TEFLoN complete")
Ejemplo n.º 3
0
def main():
    """Post-process raw TE-Locate output into redundant/non-redundant BEDs.

    When no insertion survives filtering, the two expected BED files are
    only touched.
    """
    mccutils.log("te-locate", "processing TE-Locate results")
    inputs = snakemake.input
    params = snakemake.params

    telocate_raw = inputs.telocate_raw
    te_gff = inputs.te_gff

    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = filter_by_reference(
        read_insertions(
            telocate_raw,
            sample_name,
            chromosomes,
            rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD,
        ),
        te_gff,
    )

    if len(insertions) <= 0:
        # Nothing to report: touch both expected output files.
        for suffix in ("_telocate_redundant.bed", "_telocate_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("te-locate", "TE-Locate post processing complete")
Ejemplo n.º 4
0
def main():
    """Create the TE-locate taxonomy GFF with the supplied perl script.

    The reference GFF and taxonomy TSV are copied to fixed file names in
    the current working directory, the perl script is run on them, and the
    resulting ``telocate_locations_HL.gff`` is copied to the first Snakemake
    output. Any failure prints the traceback and exits with status 1.
    """
    log = snakemake.params.log
    tmp_dir = snakemake.params.tmp_dir
    mccutils.mkdir(tmp_dir + "/telocate")

    mccutils.log("processing", "making TE-locate taxonomy file", log=log)
    try:
        local_gff = "telocate_locations.gff"
        local_taxonomy = "telocate_taxonomy.tsv"

        mccutils.run_command(["cp", snakemake.input.ref_gff, local_gff])
        mccutils.run_command(["cp", snakemake.input.taxonomy, local_taxonomy])

        perl_cmd = ["perl", snakemake.input.script, local_gff, local_taxonomy, "Alias"]
        mccutils.run_command(perl_cmd, log=log)

        mccutils.run_command(["cp", "telocate_locations_HL.gff", snakemake.output[0]])
        mccutils.check_file_exists(snakemake.output[0])

    except Exception:
        # Report the full traceback before the short summary message.
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...unable to produce TE-locate taxonomy file using",
            snakemake.input.script,
            file=sys.stderr,
        )
        sys.exit(1)

    mccutils.log("processing", "TE-locate taxonomy file created")
Ejemplo n.º 5
0
def format_gff(ingff):
    """Validate the feature IDs of a locations GFF and return them.

    Every non-comment line must have at least nine tab-separated columns
    and an ``ID=`` attribute in the ninth column. IDs must be unchanged by
    ``mccutils.replace_special_chars`` and unique within the file.

    Args:
        ingff: path to the GFF file to check.

    Returns:
        The list of feature IDs in file order.

    Exits (via ``sys.exit``) when a line has fewer than nine columns, an ID
    contains a problematic symbol, an ID is duplicated, or a line has no
    (or an empty) ``ID=`` attribute.
    """
    mccutils.log("setup","checking locations gff: "+ingff)
    gff_ids = []
    seen_ids = set()  # O(1) duplicate detection; gff_ids keeps file order
    with open(ingff,"r") as gff:
        for line in gff:
            if line.startswith("#"):
                continue

            split_line = line.split("\t")
            if len(split_line) < 9:
                sys.exit(ingff+" appears to be a malformed GFF file..exiting...\n")

            # Reset per line so a missing ID is detected for *this* line and
            # never masked by the ID found on a previous line.
            gff_id = ""
            for feat in split_line[8].split(";"):
                if feat[:3] != "ID=":
                    continue

                gff_id = feat.split("=")[1].replace("\n","")
                masked_gff_id = mccutils.replace_special_chars(gff_id)
                if gff_id != masked_gff_id:
                    mccutils.log("setup", ingff+": ERROR problematic symbol in feature name: "+gff_id+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)

                if masked_gff_id in seen_ids:
                    sys.exit("ID: "+masked_gff_id+" is not unique. please ensure each feature has a unique ID\n")
                seen_ids.add(masked_gff_id)
                gff_ids.append(masked_gff_id)

            if gff_id == "":
                sys.exit("GFF line: "+line+" is missing an ID attribute (ex. ID=chr1_TY1s1)\n")

    return gff_ids
Ejemplo n.º 6
0
def main():
    """Post-process PopoolationTE output into redundant/non-redundant BEDs.

    When no insertion passes the filters, the two expected BED files are
    only touched.
    """
    mccutils.log("popoolationte", "processing PopoolationTE results")
    params = snakemake.params
    popoolationte_out = snakemake.input.popoolationte_out

    out_dir = params.out_dir
    sample_name = params.sample_name
    log = params.log
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        popoolationte_out,
        sample_name,
        chromosomes,
        require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT,
        percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD,
    )

    if len(insertions) < 1:
        # No predictions: touch both expected output files.
        for suffix in ("_popoolationte_redundant.bed",
                       "_popoolationte_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = mccutils.make_redundant_bed(
            insertions, sample_name, out_dir, method="popoolationte"
        )
        mccutils.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="popoolationte"
        )

    mccutils.log("popoolationte", "PopoolationTE postprocessing complete")
Ejemplo n.º 7
0
def main():
    """Create the coverage fasta, or touch it when none was supplied.

    When the ``coverage_fasta`` param is the string ``"None"`` the output
    file is only touched; otherwise the supplied fasta is re-wrapped to 80
    characters per line and written to the output. On any failure the
    traceback is printed, the first three Snakemake outputs are removed and
    the process exits with status 1.
    """
    mccutils.log("processing", "making coverage fasta")
    fastas = []
    try:
        length = 80
        if snakemake.params.coverage_fasta != "None":
            fasta3 = snakemake.params.coverage_fasta
            fastas.append(fasta3)
            wrapped_lines = fix_fasta.fix_fasta_lines(fasta3, length)
            write_fasta(wrapped_lines, snakemake.output.coverage_fasta)
        else:
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...failed to create coverage fasta, check the formatting of :",
            snakemake.params.coverage_fasta,
            file=sys.stderr,
        )
        # Clean up the outputs before aborting.
        for output_index in (0, 1, 2):
            mccutils.remove(snakemake.output[output_index])
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
Ejemplo n.º 8
0
def main():
    """Build the reference fasta and the (optionally augmented) reference.

    The reference is line-fixed and has its special characters replaced
    using temporary files under ``<mcc_out>/tmp``. When the ``augment``
    param is not the string ``"None"``, the augment fasta gets the same
    treatment and is combined with the reference; otherwise the augmented
    reference is the processed reference itself.
    """
    params = snakemake.params
    reference = snakemake.input.ref
    augment = params.augment
    mcc_out = params.mcc_out
    run_id = params.run_id
    log = params.log
    out_ref = snakemake.output.ref
    out_aug_ref = snakemake.output.aug_ref

    tmp_dir = mcc_out + "/tmp"
    if not os.path.exists(tmp_dir):
        mccutils.mkdir(tmp_dir)

    mccutils.log("processing", "making reference fasta")

    # Common prefix for every intermediate file written by this step.
    tmp = mcc_out + "/tmp/" + str(run_id) + "reference.tmp"

    reference = fix_fasta_lines(reference, tmp)
    reference = mccutils.replace_special_chars_fasta(reference, tmp + "1")

    if augment == "None":
        augmented_reference = reference
    else:
        augment = fix_fasta_lines(augment, tmp + "2")
        augment = mccutils.replace_special_chars_fasta(augment, tmp + "3")
        augmented_reference = augment_reference(reference, augment, tmp + "4")

    for source, destination in ((reference, out_ref),
                                (augmented_reference, out_aug_ref)):
        mccutils.run_command(["cp", source, destination])

    mccutils.log("processing", "reference fasta created")
0
def main():
    """Post-process PopoolationTE2 predictions into McClintock BED files.

    When no insertion passes the filters, the two expected BED files are
    only touched.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")
    inputs = snakemake.input
    params = snakemake.params

    te_predictions = inputs.popoolationte2_out
    te_gff = inputs.te_gff
    taxonomy = inputs.taxonomy
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    log = params.log

    ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
    insertions = read_insertions(
        te_predictions,
        ref_tes,
        chromosomes,
        sample_name,
        both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT,
        support_threshold=config.FREQUENCY_THRESHOLD,
    )

    if len(insertions) < 1:
        # No predictions: touch both expected output files.
        for suffix in ("_popoolationte2_redundant.bed",
                       "_popoolationte2_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
Ejemplo n.º 10
0
def main():
    """Post-process tebreak output into BED and VCF files.

    Insertions are only read when the status file reports that the previous
    steps succeeded. If there are no insertions (or the previous steps
    failed) the two expected BED files are only touched.
    """
    mccutils.log("tebreak","running tebreak post processing")
    inputs = snakemake.input
    params = snakemake.params

    tebreak_out = inputs.tebreak_out
    ref_fasta = inputs.ref_fasta

    out_dir = params.out_dir
    ref_name = params.ref_name
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    status_log = params.status_log

    # A failed upstream step is treated exactly like "no insertions found".
    insertions = []
    if mccutils.check_status_file(status_log):
        insertions = read_insertions(tebreak_out, sample_name, chromosomes, config)

    if len(insertions) > 0:
        insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tebreak")
        insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tebreak")
        output.write_vcf(insertions, ref_fasta, sample_name, "tebreak", out_dir)
    else:
        for suffix in ("_tebreak_redundant.bed", "_tebreak_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir+"/"+sample_name+suffix])

    mccutils.log("tebreak","tebreak postprocessing complete")
Ejemplo n.º 11
0
def main():
    """Post-process RelocaTE GFF output into redundant/non-redundant BEDs.

    Insertions are read with the left/right support thresholds from
    ``config`` and then given reference orientations from the TE GFF. When
    no insertion remains, the two expected BED files are only touched.
    """
    inputs = snakemake.input
    params = snakemake.params

    relocate_gff = inputs.relocate_gff
    te_gff = inputs.te_gff

    out_dir = params.out_dir
    log = params.log
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate", "processing RelocaTE results")

    insertions = set_ref_orientations(
        get_insertions(
            relocate_gff,
            sample_name,
            chromosomes,
            ref_l_threshold=config.REF_LEFT_THRESHOLD,
            ref_r_threshold=config.REF_RIGHT_THRESHOLD,
            nonref_l_threshold=config.NONREF_LEFT_THRESHOLD,
            nonref_r_threshold=config.NONREF_RIGHT_THRESHOLD,
        ),
        te_gff,
    )

    if len(insertions) < 1:
        # No predictions: touch both expected output files.
        for suffix in ("_relocate_redundant.bed", "_relocate_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)

    mccutils.log("relocate", "RelocaTE postprocessing complete")
Ejemplo n.º 12
0
def main():
    """Post-process RetroSeq output into BED and VCF files.

    When no insertion passes the read-support and breakpoint-confidence
    thresholds, the two expected BED files are only touched.
    """
    mccutils.log("retroseq", "processing RetroSeq results")
    inputs = snakemake.input
    params = snakemake.params

    retroseq_out = inputs.retroseq_out
    reference_fasta = inputs.reference_fasta

    out_dir = params.out_dir
    ref_name = params.ref_name
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    insertions = read_insertions(
        retroseq_out,
        sample_name,
        chromosomes,
        support_threshold=config.READ_SUPPORT_THRESHOLD,
        breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD,
    )

    if len(insertions) < 1:
        # No predictions: touch both expected output files.
        for suffix in ("_retroseq_redundant.bed", "_retroseq_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="retroseq"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="retroseq"
        )
        output.write_vcf(
            insertions, reference_fasta, sample_name, "retroseq", out_dir
        )

    mccutils.log("retroseq", "RetroSeq post processing complete")
Ejemplo n.º 13
0
def sort_bam(bam, sorted_bam, threads=1, log=None):
    """Sort ``bam`` with ``samtools sort`` and return ``sorted_bam``.

    ``threads`` is passed to samtools via ``-@``; ``log`` is forwarded to
    the mccutils logging and command helpers.
    """
    mccutils.log("popoolationte2", "sorting BAM", log=log)
    sort_cmd = ["samtools", "sort", "-@", str(threads), bam, "-o", sorted_bam]
    mccutils.run_command(sort_cmd, log=log)
    return sorted_bam
Ejemplo n.º 14
0
def main():
    """Post-process ngs_te_mapper2 output into BED and VCF files.

    Insertions are only read when the status file reports that the previous
    steps succeeded. If there are no insertions (or the previous steps
    failed) the two expected BED files are only touched.
    """
    ref_bed = snakemake.input.ref_bed
    nonref_bed = snakemake.input.nonref_bed
    reference_fasta = snakemake.input.reference_fasta

    threads = snakemake.threads
    log = snakemake.params.log
    sample_name = snakemake.params.sample_name
    out_dir = snakemake.params.out_dir
    chromosomes = snakemake.params.chromosomes.split(",")
    status_log = snakemake.params.status_log

    out_bed = snakemake.output[0]

    succeeded = mccutils.check_status_file(status_log)
    if succeeded:
        mccutils.log("ngs_te_mapper2","processing ngs_te_mapper2 results", log=log)
        insertions = read_insertions(ref_bed, nonref_bed, chromosomes, sample_name, out_dir)
        if len(insertions) > 0:
            insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper2")
            # Keep the non-redundant result (the original bound it to a
            # misspelled name) so the VCF is written from the non-redundant
            # insertions, as the other status-aware post-processing steps do.
            insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper2")
            output.write_vcf(insertions, reference_fasta, sample_name, "ngs_te_mapper2", out_dir)

        else:
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_redundant.bed"])
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_nonredundant.bed"])

        mccutils.log("ngs_te_mapper2","ngs_te_mapper2 postprocessing complete")
    else:
        # Previous steps failed: still touch the expected output files.
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_redundant.bed"])
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_nonredundant.bed"])
Ejemplo n.º 15
0
def make_plots(te_names,
               all_coverage_files,
               uniq_coverage_files,
               avg_norm_te_depths,
               genome_depth,
               sample_name,
               out,
               trim_edges=0):
    """Create one coverage plot (PNG) per TE under ``<out>/plots``.

    Args:
        te_names: TE names; each one becomes the plot's file name.
        all_coverage_files: samtools depth files, indexed like ``te_names``.
        uniq_coverage_files: samtools depth files, indexed like ``te_names``.
        avg_norm_te_depths: per-TE value passed to ``plot_coverage`` as the
            horizontal line, indexed like ``te_names``.
        genome_depth: forwarded to ``plot_coverage``.
        sample_name: forwarded to ``plot_coverage``.
        out: output directory, with or without a trailing slash.
        trim_edges: forwarded to ``plot_coverage``.
    """
    import os

    mccutils.log("coverage", "creating TE coverage plots")
    # Build the directory and the file paths from the same base so plots land
    # in the directory that was created whether or not `out` ends with "/".
    # The original wrote to out + "plots/", which only matched the created
    # out + "/plots" directory when `out` had a trailing slash.
    plots_dir = os.path.join(out, "plots")
    mccutils.mkdir(plots_dir)

    plot_height = 3
    plot_width = 10
    for x, te_name in enumerate(te_names):
        chrom, all_pos, all_cov = read_samtools_depth_file(
            all_coverage_files[x])
        _, uniq_pos, uniq_cov = read_samtools_depth_file(
            uniq_coverage_files[x])

        hline = avg_norm_te_depths[x]
        plot_path = os.path.join(plots_dir, te_name + ".png")
        plot = plot_coverage(chrom,
                             all_pos,
                             all_cov,
                             uniq_pos,
                             uniq_cov,
                             sample_name,
                             plot_height,
                             plot_width,
                             genome_depth,
                             hline,
                             trim_edges=trim_edges)
        plot.savefig(plot_path, bbox_inches="tight")
        plot.close()
        mccutils.log("coverage", "plot created: " + plot_path)
Ejemplo n.º 16
0
def map_reads(ref, fq1, fq2, out, threads=1, log=None):
    """Map ``fq1``/``fq2`` to ``ref`` with ``bwa bwasw``.

    The command's stdout is written to ``<out>/mapped.sam``, whose path is
    returned.
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)
    sam = out + "/" + "mapped.sam"
    bwa_cmd = ["bwa", "bwasw", "-t", str(threads), ref, fq1, fq2]
    mccutils.run_command_stdout(bwa_cmd, sam, log=log)
    return sam
Ejemplo n.º 17
0
def main():
    """Run TEFLoN on the sample BAM with the thresholds from ``config``."""
    mccutils.log("teflon", "Running TEFLoN")

    inputs = snakemake.input
    params = snakemake.params

    consensus = inputs.consensus
    reference_genome = inputs.reference_genome
    ref_bed = inputs.ref_bed
    teflon_taxonomy = inputs.teflon_taxonomy
    bam = inputs.bam

    threads = snakemake.threads
    out_dir = params.out_dir
    script_dir = params.script_dir
    log = params.log

    sample_table = make_sample_table(out_dir, bam)

    # Map run_teflon's keyword names onto the short TEFLoN parameter keys.
    teflon_options = {
        "quality_threshold": config.PARAMETERS['q'],
        "stdev": config.PARAMETERS['sd'],
        "cov": config.PARAMETERS['cov'],
        "te_support1": config.PARAMETERS['n1'],
        "te_support2": config.PARAMETERS['n2'],
        "read_count_lower_threshold": config.PARAMETERS['lt'],
        "read_count_higher_threshold": config.PARAMETERS['ht'],
    }
    run_teflon(
        script_dir,
        out_dir,
        sample_table,
        threads=threads,
        log=log,
        **teflon_options,
    )
Ejemplo n.º 18
0
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    """Create a BED file of the reference regions not annotated as TEs.

    The RepeatMasker GFF is converted to BED and sorted, a bedtools genome
    file is derived from ``<reference>.fai``, and ``bedtools complement``
    produces the non-TE intervals. The intermediate files are removed.

    Args:
        reference: reference fasta path; ``<reference>.fai`` must exist.
        masked_gff: RepeatMasker GFF path.
        run_id: string used in the intermediate and output file names.
        out: output directory; files are written under ``<out>/input``.
        log: forwarded to the mccutils logging and command helpers.

    Returns:
        Path of the non-TE BED file.
    """
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)
    masked_bed = out + "/input/" + run_id + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = out + "/input/" + run_id + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", masked_bed],
                                sorted_bed,
                                log=log)

    # The original also scanned the whole reference fasta to collect
    # chromosome names that were never used; that dead pass is dropped.

    # bedtools genome file: the first two columns (name, length) of the .fai.
    chrom_idx = out + "/input/" + run_id + "_ref.genome"
    with open(reference + ".fai", "r") as faidx, open(chrom_idx, "w") as genome:
        for line in faidx:
            split_line = line.split("\t")
            genome.write("\t".join([split_line[0], split_line[1]]) + "\n")

    non_te_bed = out + "/input/" + run_id + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for f in [masked_bed, sorted_bed, chrom_idx]:
        mccutils.remove(f)

    return non_te_bed
Ejemplo n.º 19
0
def main():
    """Run the TEMP insertion and absence modules, then clean up.

    Appends the input paths to the log file, reads the median insert size,
    runs both TEMP steps and finally removes the entries of ``out_dir``
    whose names contain ``.sorted.bam`` or ``.fastq``.
    """
    bam = snakemake.input.bam
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    with open(log, "a") as l:
        l.write("BAM: " + bam + "\n")
        l.write("2bit: " + twobit + "\n")
        l.write("consensus fasta: " + consensus + "\n")
        l.write("reference TE BED: " + ref_te_bed + "\n")
        l.write("Taxonomy TSV: " + taxonomy + "\n")

    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    scripts_dir = snakemake.params.scripts_dir
    sample_name = snakemake.params.sample_name

    mccutils.log("temp", "running TEMP Module")
    median_insert_size = get_median_insert_size(median_insert_size_file)

    run_temp_insertion(bam, scripts_dir, consensus, ref_te_bed, taxonomy,
                       median_insert_size, threads, out_dir, log)

    run_temp_absence(bam, scripts_dir, consensus, ref_te_bed, twobit, taxonomy,
                     median_insert_size, threads, out_dir, log)

    # os.listdir() yields bare entry names, so each one is joined with out_dir;
    # passing the bare name would resolve against the current working
    # directory rather than the directory that was listed.
    # NOTE(review): assumes the TEMP steps above do not chdir while out_dir
    # is a relative path - confirm.
    for f in os.listdir(out_dir):
        if ".sorted.bam" in f or ".fastq" in f:
            mccutils.remove(os.path.join(out_dir, f))
Ejemplo n.º 20
0
def main():
    """Concatenate the first three Snakemake inputs into the first output."""
    mccutils.log("processing", "making PopoolationTE reference fasta")
    cat_cmd = ["cat"]
    for input_index in (0, 1, 2):
        cat_cmd.append(snakemake.input[input_index])
    mccutils.run_command_stdout(cat_cmd, snakemake.output[0])
    mccutils.log("processing", "PopoolationTE reference fasta created")
Ejemplo n.º 21
0
def main():
    """Post-process RetroSeq output into BED and VCF files.

    Insertions are only read when the status file reports that the previous
    steps succeeded. If there are no insertions (or the previous steps
    failed) the two expected BED files are only touched.
    """
    mccutils.log("retroseq","processing RetroSeq results")
    inputs = snakemake.input
    params = snakemake.params

    retroseq_out = inputs.retroseq_out
    reference_fasta = inputs.reference_fasta

    out_dir = params.out_dir
    ref_name = params.ref_name
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    status_log = params.status_log

    # A failed upstream step is treated exactly like "no insertions found".
    insertions = []
    if mccutils.check_status_file(status_log):
        insertions = read_insertions(
            retroseq_out,
            sample_name,
            chromosomes,
            support_threshold=config.PARAMS["read_support_threshold"],
            breakpoint_threshold=config.PARAMS["breakpoint_confidence_threshold"],
        )

    if len(insertions) >= 1:
        insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="retroseq")
        insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="retroseq")
        output.write_vcf(insertions, reference_fasta, sample_name, "retroseq", out_dir)
    else:
        for suffix in ("_retroseq_redundant.bed", "_retroseq_nonredundant.bed"):
            mccutils.run_command(["touch",out_dir+"/"+sample_name+suffix])

    mccutils.log("retroseq","RetroSeq post processing complete")
Ejemplo n.º 22
0
def main():
    """Post-process TE-Locate output into BED and VCF files.

    Insertions are only read (and filtered against the reference TE GFF)
    when the status file reports that the previous steps succeeded. If there
    are no insertions (or the previous steps failed) the two expected BED
    files are only touched.
    """
    mccutils.log("te-locate","processing TE-Locate results")
    telocate_raw = snakemake.input.telocate_raw
    te_gff = snakemake.input.te_gff
    reference_fasta = snakemake.input.reference_fasta

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    chromosomes = snakemake.params.chromosomes.split(",")
    status_log = snakemake.params.status_log

    prev_steps_succeeded = mccutils.check_status_file(status_log)

    if prev_steps_succeeded:
        insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.PARAMS['read_pair_support_threshold'])
        insertions = filter_by_reference(insertions, te_gff)
        if len(insertions) > 0:
            insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="telocate")
            # Keep the non-redundant result (the original bound it to a
            # misspelled name) so the VCF is written from the non-redundant
            # insertions, as the other status-aware post-processing steps do.
            insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="telocate")
            output.write_vcf(insertions, reference_fasta, sample_name, "telocate", out_dir)
        else:
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_redundant.bed"])
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_nonredundant.bed"])
    else:
        # Previous steps failed: still touch the expected output files.
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_redundant.bed"])
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_nonredundant.bed"])
    mccutils.log("te-locate", "TE-Locate post processing complete")
Ejemplo n.º 23
0
def main():
    """Convert the reference fasta (first input) to 2bit (first output)."""
    log = snakemake.params.log
    mccutils.log(
        "processing", "creating 2bit file from reference genome fasta", log=log
    )
    fasta_path = snakemake.input[0]
    twobit_path = snakemake.output[0]
    mccutils.run_command(["faToTwoBit", fasta_path, twobit_path], log=log)
    mccutils.log("processing", "reference 2bit file created")
Ejemplo n.º 24
0
def main():
    """Post-process jitterbug output into BED and VCF files.

    Insertions are only read when the status file reports that the previous
    steps succeeded. If there are no insertions (or the previous steps
    failed) the two expected BED files are only touched.
    """
    mccutils.log("jitterbug", "jitterbug postprocessing")

    inputs = snakemake.input
    params = snakemake.params

    jitterbug_out = inputs.jitterbug_out
    te_taxonomy = inputs.taxonomy
    reference_fasta = inputs.reference_fasta

    out_dir = params.out_dir
    log = params.log
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    status_log = params.status_log

    out = snakemake.output.out

    # A failed upstream step is treated exactly like "no insertions found".
    insertions = []
    if mccutils.check_status_file(status_log):
        insertions = read_insertions(
            jitterbug_out,
            te_taxonomy,
            chromosomes,
            sample_name,
            min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'],
            min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'],
            min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'],
            min_zygosity=config.FILTER['MIN_ZYGOSITY'],
        )

    if len(insertions) >= 1:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="jitterbug"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="jitterbug"
        )
        output.write_vcf(
            insertions, reference_fasta, sample_name, "jitterbug", out_dir
        )
    else:
        for suffix in ("_jitterbug_redundant.bed", "_jitterbug_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
Ejemplo n.º 25
0
def sam_to_bam(jar, fq1, fq2, sam1, sam2, bam, out_dir, threads=1, log=None):
    """Run the jar's ``se2pe`` command to build a sorted BAM; return ``bam``.

    Java's temporary directory is pointed at ``<out_dir>/tmp``. ``threads``
    is accepted but not used by this function.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)
    java_cmd = ["java", "-Djava.io.tmpdir=" + out_dir + "/tmp", "-jar", jar]
    se2pe_args = [
        "se2pe",
        "--fastq1", fq1,
        "--fastq2", fq2,
        "--bam1", sam1,
        "--bam2", sam2,
        "--sort",
        "--output", bam,
    ]
    mccutils.run_command(java_cmd + se2pe_args, log=log)
    return bam
Ejemplo n.º 26
0
def main():
    """Post-process PopoolationTE2 predictions into BED and VCF files.

    Reference TEs and insertions are only read when the status file reports
    that the previous step succeeded. If there are no insertions (or the
    previous step failed) the two expected BED files are only touched.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")
    inputs = snakemake.input
    params = snakemake.params

    te_predictions = inputs.popoolationte2_out
    te_gff = inputs.te_gff
    taxonomy = inputs.taxonomy
    reference_fasta = inputs.reference_fasta

    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")
    log = params.log

    status_log = params.status_log

    # A failed upstream step is treated exactly like "no insertions found".
    insertions = []
    if mccutils.check_status_file(status_log):
        ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
        insertions = read_insertions(
            te_predictions,
            ref_tes,
            chromosomes,
            sample_name,
            both_end_support_needed=config.PARAMS["require_both_end_support"],
            support_threshold=config.PARAMS["frequency_threshold"],
        )

    if len(insertions) >= 1:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="popoolationte2"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="popoolationte2"
        )
        output.write_vcf(
            insertions, reference_fasta, sample_name, "popoolationte2", out_dir
        )
    else:
        for suffix in ("_popoolationte2_redundant.bed",
                       "_popoolationte2_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
Ejemplo n.º 27
0
def make_depth_table(te_fasta, bam, genome_depth, run_id, out, depth_csv, log, trim_edges=0):
    """Write per-TE normalized depth files and the summary depth CSV.

    For every header line of ``te_fasta`` the TE's depth is computed with
    ``samtools depth`` twice (``-Q 1`` for the "highQ" file and ``-Q 0`` for
    the "allQ" file), each depth file is rewritten divided by
    ``genome_depth``, and one row with the average normalized depths is
    appended to ``depth_csv``.

    Returns:
        ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` as parallel lists in fasta order.
    """
    mccutils.log("coverage","creating TE depth coverage table", log=log)
    with open(depth_csv, "w") as table:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth"+"\n")

    def write_normalized(depth_file, normalized_file):
        # Rewrite a samtools depth file with coverage / genome_depth (2 d.p.).
        chrom, positions, coverages = read_samtools_depth_file(depth_file)
        with open(normalized_file, "w") as covfile:
            for idx, pos in enumerate(positions):
                scaled = str(round(coverages[idx]/genome_depth, 2))
                covfile.write("\t".join([chrom, str(pos), scaled])+"\n")

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    with open(te_fasta,"r") as fa:
        for header in fa:
            if ">" not in header:
                continue

            te_name = header.replace("\n","").replace(">","")

            mccutils.mkdir(out+"/te-depth-files")
            depth_prefix = out+"/te-depth-files/"+te_name

            highQ = depth_prefix+".highQ.cov"
            highQ_cmd = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "1"]
            mccutils.run_command_stdout(highQ_cmd, highQ, log=log)

            allQ = depth_prefix+".allQ.cov"
            allQ_cmd = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "0"]
            mccutils.run_command_stdout(allQ_cmd, allQ, log=log)

            # make normalized coverage files
            write_normalized(allQ, depth_prefix+".allQ.normalized.cov")
            write_normalized(highQ, depth_prefix+".highQ.normalized.cov")

            avg_norm_depth = get_avg_depth(allQ, trim_edges=trim_edges)/genome_depth
            avg_uniq_norm_depth = get_avg_depth(highQ, trim_edges=trim_edges)/genome_depth

            row = [te_name, str(round(avg_norm_depth,2)), str(round(avg_uniq_norm_depth,2))]
            with open(depth_csv, "a") as table:
                table.write(",".join(row)+"\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
Ejemplo n.º 28
0
def main():
    """Produce the consensus fasta for this run.

    Pulls the input and output consensus paths from the ``snakemake``
    object, passes them to ``fix_fasta_lines`` and logs a message before
    and after that call.
    """
    source_fasta = snakemake.input.consensus
    # These params are read here but not used anywhere below.
    _mcc_out = snakemake.params.mcc_out
    _run_id = snakemake.params.run_id
    target_fasta = snakemake.output.consensus

    mccutils.log("processing", "making consensus fasta")
    # The return value is not needed afterwards, so it is not kept.
    fix_fasta_lines(source_fasta, target_fasta)
    mccutils.log("processing", "consensus fasta created")
Ejemplo n.º 29
0
def main():
    """Drive the PoPoolationTE workflow from the ``snakemake`` object.

    Steps, in order:
      1. append the input paths to the run log,
      2. compute the read length and the median insert size,
      3. derive ``max_dist`` as ``int(median_insert_size * 3) + read_length``,
      4. build the known-insert file from the TE GFF,
      5. call ``run_popoolationte`` with thresholds taken from ``config``,
      6. touch the first snakemake output and remove the SAM/fastq inputs.
    """
    mccutils.log("popoolationte", "running PopoolationTE")

    inputs = snakemake.input
    ref_fasta = inputs.ref_fasta
    taxonomy = inputs.taxonomy
    te_gff = inputs.te_gff
    fq1 = inputs.fq1
    fq2 = inputs.fq2
    sam = inputs.sam
    log = snakemake.params.log

    # Label/path pairs appended to the log file, one per line.
    log_entries = (
        ("reference fasta: ", ref_fasta),
        ("Taxonomy TSV: ", taxonomy),
        ("TE GFF: ", te_gff),
        ("fastq1: ", fq1),
        ("fastq2: ", fq2),
        ("SAM: ", sam),
    )
    with open(log, "a") as log_handle:
        for label, path in log_entries:
            log_handle.write(label + path + "\n")

    params = snakemake.params
    out_dir = params.out_dir
    sample_name = params.sample_name  # read but not used below
    script_dir = params.script_dir

    mccutils.log("popoolationte", "getting read length")
    read_length = get_read_length(fq1, fq2)

    mccutils.log("popoolationte", "calculating median insert size")
    median_insert_size = get_median_insert_size(sam)
    max_dist = int(median_insert_size * 3) + read_length

    mccutils.log("popoolationte",
                 "converting TE gff to PoPoolationTE known TE file")
    known_inserts = make_known_insert_file(te_gff, out_dir)

    mccutils.log("popoolationte", "running the PoPoolationTE workflow scripts")
    # Per-step thresholds, all sourced from the config module.
    tuning = {
        "identify_min_count": config.IDENTIFY_TE_INSERTSITES["min-count"],
        "identify_min_qual": config.IDENTIFY_TE_INSERTSITES["min-map-qual"],
        "crosslink_site_shift": config.CROSSLINK_TE_SITES['single-site-shift'],
        "update_te_inserts_site_shift":
            config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift'],
        "estimate_polymorphism_min_qual":
            config.ESTIMATE_POLYMORPHISM['min-map-qual'],
        "filter_min_count": config.FILTER['min-count'],
    }
    run_popoolationte(sam, ref_fasta, taxonomy, read_length,
                      median_insert_size, max_dist, known_inserts,
                      script_dir, out_dir, log=log, **tuning)

    mccutils.run_command(["touch", snakemake.output[0]])

    # The SAM and both fastq inputs are removed once the workflow is done.
    for consumed in (sam, fq1, fq2):
        mccutils.remove(consumed)
Ejemplo n.º 30
0
def main():
    """Post-process RelocaTE2 GFF output into BED files.

    Reference and non-reference insertions are loaded with
    ``get_insertions`` using the matching ``config`` thresholds, the
    reference insertions are passed through ``fix_ref_te_names``, and the
    combined list is handed to ``mccutils.make_redundant_bed`` followed by
    ``mccutils.make_nonredundant_bed``. When there are no insertions at
    all, the two expected BED paths are touched instead.
    """
    inputs = snakemake.input
    nonref_gff = inputs.nonref_gff
    ref_gff = inputs.ref_gff
    rm_out = inputs.rm_out

    params = snakemake.params
    log = params.log  # read but not used below
    out_dir = params.out_dir
    sample_name = params.sample_name
    chromosomes = params.chromosomes.split(",")

    mccutils.log("relocate2", "processing RelocaTE2 results")

    ref_thresholds = {
        "l_support_threshold": config.REF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.REF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.REF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.REF_RIGHT_JUNCTION_THRESHOLD,
    }
    ref_calls = get_insertions(ref_gff, sample_name, chromosomes,
                               insert_type="ref", **ref_thresholds)

    nonref_thresholds = {
        "l_support_threshold": config.NONREF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.NONREF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.NONREF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.NONREF_RIGHT_JUNCTION_THRESHOLD,
    }
    nonref_calls = get_insertions(nonref_gff, sample_name, chromosomes,
                                  insert_type="nonref", **nonref_thresholds)

    ref_calls = fix_ref_te_names(ref_calls, rm_out, sample_name)
    combined = ref_calls + nonref_calls

    if len(combined) < 1:
        # Nothing to report: just touch both expected output paths.
        for suffix in ("_relocate2_redundant.bed",
                       "_relocate2_nonredundant.bed"):
            mccutils.run_command(["touch", out_dir + "/" + sample_name + suffix])
    else:
        # The non-redundant step consumes what the redundant step returns.
        combined = mccutils.make_redundant_bed(
            combined, sample_name, out_dir, method="relocate2")
        mccutils.make_nonredundant_bed(
            combined, sample_name, out_dir, method="relocate2")

    mccutils.log("relocate2", "RelocaTE2 postprocessing complete")