Exemple #1
0
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Build Insertion records for reference TEs left after removing absent ones.

    Runs ``bedtools subtract -A -a te_gff -b absence_bed``, stores the result
    in a temporary GFF under ``out`` and converts each record of that file
    into an ``output.Insertion`` wrapping an ``output.Temp``.

    Args:
        te_gff: path to the GFF passed to bedtools as ``-a``.
        absence_bed: path to the BED passed to bedtools as ``-b``.
        sample: sample name embedded in every insertion name.
        out: directory that receives the temporary GFF.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        list of insertions, each with ``type`` set to ``"reference"``.
    """
    surviving_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        surviving_gff,
        log=log)

    reference_tes = []
    with open(surviving_gff, "r") as handle:
        for record in handle:
            # Any line containing '#' is skipped.
            if "#" in record:
                continue

            # ';' becomes a tab so each attribute entry is its own column.
            fields = record.replace(";", "\t").split("\t")

            te = output.Insertion(output.Temp())
            te.chromosome = fields[0]
            te.start = int(fields[3])
            te.end = int(fields[4])
            # Column 9 is expected to look like "key=value"; the value
            # becomes the first part of the insertion name.
            label = fields[9].split("=")[1]
            te.name = label + "|reference|NA|" + sample + "|temp|nonab|"
            te.strand = fields[6]
            te.type = "reference"
            reference_tes.append(te)

    mccutils.remove(surviving_gff)
    return reference_tes
Exemple #2
0
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    """Filter, rename and sort an ngs_te_mapper BED into the nonredundant BED.

    Each input line has ';' replaced by a tab before being split. A record is
    kept when column 8 (as an int) is greater than ``min_read_cutoff`` and
    column 1 is in ``chromosomes``. Kept records are written as 6-column BED,
    sorted with ``bedtools sort`` and copied below a track header into
    ``<out_dir>/<sample_name>_ngs_te_mapper_nonredundant.bed``. When nothing
    is kept, that file is created with ``touch`` instead.

    Args:
        bed: path to the input BED.
        chromosomes: container of accepted chromosome names.
        sample_name: sample name used in record names and the output path.
        log: log target forwarded to ``mccutils.run_command_stdout``.
        out_dir: directory for intermediate and final files.
        min_read_cutoff: records need a column-8 value strictly above this.
    """
    raw_bed = out_dir + "/unsorted.bed"
    final_bed = out_dir + "/" + sample_name + "_ngs_te_mapper_nonredundant.bed"

    kept = 0
    with open(raw_bed, "w") as writer, open(bed, "r") as reader:
        for index, record in enumerate(reader):
            cols = record.replace(";", "\t").split("\t")
            if int(cols[7]) <= min_read_cutoff or cols[0] not in chromosomes:
                continue

            kept += 1
            # The numeric suffix is the 1-based input line number, so it
            # stays unique even when some lines are filtered out.
            name = (cols[5] + "_" + cols[8].replace("\n", "") + "_" +
                    sample_name + "_ngs_te_mapper_sr_" + str(index + 1))
            row = [cols[0], cols[1], cols[2], name, "0", cols[4]]
            writer.write("\t".join(row) + "\n")

    if kept < 1:
        mccutils.run_command(["touch", final_bed])
    else:
        sorted_bed = out_dir + "/sorted.bed"
        mccutils.run_command_stdout(
            ["bedtools", "sort", "-i", raw_bed], sorted_bed, log=log)

        track_header = ('track name="' + sample_name +
                        '_ngs_te_mapper" description="' + sample_name +
                        '_ngs_te_mapper"\n')
        with open(final_bed, "w") as writer:
            writer.write(track_header)
            with open(sorted_bed, "r") as reader:
                for record in reader:
                    writer.write(record)
        mccutils.remove(sorted_bed)

    mccutils.remove(raw_bed)
Exemple #3
0
def map_reads(out_dir, fq1, fq2, threads=1, log=None):
    """Index the TEFLoN mapping reference, map reads, and return a sorted BAM.

    Steps: ``bwa index`` on the mapping reference, ``bwa mem -Y`` into a SAM,
    ``samtools view -Sb`` into a BAM, ``samtools sort`` and ``samtools index``.
    The intermediate SAM and unsorted BAM are removed afterwards.

    Args:
        out_dir: directory holding ``teflon.prep_MP/teflon.mappingRef.fa``;
            it also receives the output files.
        fq1: first FASTQ passed to ``bwa mem``.
        fq2: second FASTQ passed to ``bwa mem``.
        threads: thread count for ``bwa mem`` and ``samtools sort``.
        log: log target forwarded to the ``mccutils`` command runners.

    Returns:
        Path of the sorted BAM.
    """
    # Function-scope import so this block does not depend on the file header.
    import os

    reference_genome = out_dir+"/teflon.prep_MP/teflon.mappingRef.fa"
    mccutils.run_command(["bwa", "index", reference_genome], log=log)

    # The original concatenated out_dir+"teflon.sam" with no separator, which
    # is only correct when out_dir ends in "/". os.path.join gives the same
    # string in that case and a valid path otherwise.
    out_sam = os.path.join(out_dir, "teflon.sam")
    out_bam = os.path.join(out_dir, "teflon.bam")
    sorted_bam = os.path.join(out_dir, "teflon.sorted.bam")

    command = [
        "bwa", "mem",
        "-t", str(threads),
        "-Y", reference_genome,
        fq1,
        fq2
    ]
    mccutils.run_command_stdout(command, out_sam, log=log)

    command = ["samtools", "view", "-Sb", out_sam]
    mccutils.run_command_stdout(command, out_bam, log=log)

    command = ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, out_bam]
    mccutils.run_command(command, log=log)

    command = ["samtools", "index", sorted_bam]
    mccutils.run_command(command, log=log)

    # Only the sorted, indexed BAM is kept.
    mccutils.remove(out_sam)
    mccutils.remove(out_bam)

    return sorted_bam
Exemple #4
0
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    """Create a BED file of the reference regions not covered by TEs.

    Converts ``masked_gff`` to BED with ``repeatmasker_gff_to_bed``, sorts it
    with ``bedtools sort``, builds a two-column genome file from the first two
    columns of ``<reference>.fai`` and runs ``bedtools complement``.
    The three intermediate files are removed before returning.

    The original also read every header line of ``reference`` into a list
    that was never used; that dead pass over the FASTA has been removed.

    Args:
        reference: path to the reference FASTA; ``<reference>.fai`` must exist.
        masked_gff: GFF handed to ``repeatmasker_gff_to_bed``.
        run_id: string used as a prefix for the files under ``<out>/input/``.
        out: output directory; files are written to ``<out>/input/``.
        log: log target forwarded to the ``mccutils`` helpers.

    Returns:
        Path of the non-TE BED file.
    """
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)
    masked_bed = out + "/input/" + run_id + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = out + "/input/" + run_id + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", masked_bed],
                                sorted_bed,
                                log=log)

    # bedtools complement takes a "name<TAB>length" genome file, which is
    # the first two columns of the faidx index.
    chrom_idx = out + "/input/" + run_id + "_ref.genome"
    with open(reference + ".fai", "r") as faidx:
        with open(chrom_idx, "w") as genome:
            for line in faidx:
                split_line = line.split("\t")
                genome.write("\t".join([split_line[0], split_line[1]]) + "\n")

    non_te_bed = out + "/input/" + run_id + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for f in [masked_bed, sorted_bed, chrom_idx]:
        mccutils.remove(f)

    return non_te_bed
Exemple #5
0
def main():
    """Concatenate the first three snakemake inputs into the first output."""
    mccutils.log("processing", "making PopoolationTE reference fasta")

    fasta_parts = [snakemake.input[idx] for idx in range(3)]
    mccutils.run_command_stdout(["cat"] + fasta_parts, snakemake.output[0])

    mccutils.log("processing", "PopoolationTE reference fasta created")
def map_reads(ref, fq1, fq2, out, threads=1, log=None):
    """Map both FASTQ files against ``ref`` with ``bwa bwasw``.

    Args:
        ref: reference passed to bwa.
        fq1: first FASTQ file.
        fq2: second FASTQ file.
        out: directory that receives ``mapped.sam``.
        threads: value for bwa's ``-t`` option.
        log: log target forwarded to the ``mccutils`` helpers.

    Returns:
        Path ``<out>/mapped.sam``.
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)

    sam = "/".join([out, "mapped.sam"])
    bwasw = ["bwa", "bwasw", "-t", str(threads), ref, fq1, fq2]
    mccutils.run_command_stdout(bwasw, sam, log=log)

    return sam
Exemple #7
0
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Build Insertion records for reference TEs left after removing absent ones.

    Runs ``bedtools subtract -A -a te_gff -b absence_bed``, stores the result
    in a temporary GFF under ``out`` and converts each record of that file
    into a ``mccutils.Insertion``. Every TEMP-specific field on ``insert.temp``
    is set to the placeholder ``"!"``.

    Args:
        te_gff: path to the GFF passed to bedtools as ``-a``.
        absence_bed: path to the BED passed to bedtools as ``-b``.
        sample: sample name embedded in every insertion name.
        out: directory that receives the temporary GFF.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        list of insertions, each with ``type`` set to ``"reference"``.
    """
    placeholder_fields = (
        "support",
        "classification",
        "junction1Support",
        "junction2Support",
        "junction1",
        "junction2",
        "frequency",
    )

    surviving_gff = out+"/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        surviving_gff,
        log=log)

    reference_tes = []
    with open(surviving_gff, "r") as handle:
        for record in handle:
            # Any line containing '#' is skipped.
            if "#" in record:
                continue

            # ';' becomes a tab so each attribute entry is its own column.
            fields = record.replace(";", "\t").split("\t")

            te = mccutils.Insertion()
            te.chromosome = fields[0]
            te.start = int(fields[3])
            te.end = int(fields[4])
            te.name = fields[9].split("=")[1]+"_reference_"+sample+"_temp_nonab_"
            te.strand = fields[6]
            for field in placeholder_fields:
                setattr(te.temp, field, "!")
            te.type = "reference"

            reference_tes.append(te)

    mccutils.remove(surviving_gff)
    return reference_tes
Exemple #8
0
def make_nonredundant_bed(insertions, sample_name, out_dir):
    """Write the TE-locate nonredundant BED, one record per (chromosome, end).

    Insertions sharing the same chromosome and end coordinate are collapsed
    to the one with the highest ``read_pair_support``; on a tie the first one
    seen is kept. The survivors are written as 6-column BED, sorted with
    ``bedtools sort`` and copied below a track header into
    ``<out_dir>/<sample_name>_telocate_nonredundant.bed``.

    NOTE(review): the original replaced the stored insertion only when the
    stored one had MORE support than the new one, so it kept the weakest
    duplicate. The comparison is flipped here to keep the best-supported
    insertion -- confirm this matches the intended deduplication.

    Args:
        insertions: iterable of objects exposing ``chromosome``, ``start``,
            ``end``, ``name``, ``strand`` and ``read_pair_support``.
        sample_name: sample name used in the track header and output path.
        out_dir: directory for intermediate and final files.
    """
    uniq_inserts = {}
    for insert in insertions:
        key = "_".join([insert.chromosome, str(insert.end)])
        current = uniq_inserts.get(key)
        if current is None or insert.read_pair_support > current.read_pair_support:
            uniq_inserts[key] = insert

    tmp_bed = out_dir+"/tmp_telocate_nonredundant.bed"
    with open(tmp_bed, "w") as outbed:
        for insert in uniq_inserts.values():
            # BED starts are 0-based, hence start-1.
            out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand])
            outbed.write(out_line+"\n")

    sorted_bed = out_dir+"/sorted.bed"
    command = ["bedtools", "sort", "-i", tmp_bed]
    mccutils.run_command_stdout(command, sorted_bed)

    nonredundant_bed = out_dir+"/"+sample_name+"_telocate_nonredundant.bed"
    with open(sorted_bed, "r") as inbed:
        with open(nonredundant_bed, "w") as outbed:
            header = 'track name="'+sample_name+'_TE-locate" description="'+sample_name+'_TE-locate"\n'
            outbed.write(header)
            for line in inbed:
                outbed.write(line)

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)
def sort_bam(bam, threads=1, log=None):
    """Sort ``bam`` with ``samtools sort`` and return the sorted file's path.

    The output name is the input name with its last dot-separated component
    replaced by ``sorted.bam`` (``x.bam`` -> ``x.sorted.bam``).

    The original passed ``-o sorted_bam`` and also redirected stdout onto the
    same file, and placed the options after the positional input. The command
    is now run without a stdout redirect (samtools writes the file itself via
    ``-o``) and the input is the last argument, so option parsing does not
    rely on argument permutation.

    Args:
        bam: path to the BAM to sort.
        threads: value for samtools' ``-@`` option.
        log: log target forwarded to ``mccutils.run_command``.

    Returns:
        Path of the sorted BAM.
    """
    name_parts = bam.split(".")
    name_parts[-1] = "sorted.bam"
    sorted_bam = ".".join(name_parts)

    command = ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, bam]
    mccutils.run_command(command, log=log)

    return sorted_bam
def map_reads(fq, fasta, threads=1, log=None):
    """Map ``fq`` against ``fasta`` with ``bwa bwasw``.

    The SAM path is ``fq`` with its last dot-separated component replaced
    by ``sam``.

    Args:
        fq: FASTQ file to map.
        fasta: reference passed to bwa.
        threads: value for bwa's ``-t`` option.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        Path of the SAM file.
    """
    stem_parts = fq.split(".")[:-1]
    outfile = ".".join(stem_parts + ["sam"])

    mccutils.run_command_stdout(
        ["bwa", "bwasw", "-t", str(threads), fasta, fq], outfile, log=log)

    return outfile
def bam_to_sam(bam, threads=1, log=None):
    """Dump ``bam`` with ``samtools view`` into a sibling ``.sam`` file.

    The SAM path is ``bam`` with its last dot-separated component replaced
    by ``sam``.

    Args:
        bam: BAM file to convert.
        threads: value for samtools' ``-@`` option.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        Path of the SAM file.
    """
    # rpartition yields ("", "", bam) when there is no dot, which gives the
    # same result ("sam") as replacing the only split component.
    stem, dot, _ = bam.rpartition(".")
    sam = stem + dot + "sam"

    view_cmd = ["samtools", "view", "-@", str(threads), bam]
    mccutils.run_command_stdout(view_cmd, sam, log=log)

    return sam
def sam_to_bam(sam, threads=1, log=None):
    """Convert ``sam`` with ``samtools view -Sb`` into a sibling ``.bam`` file.

    The BAM path is ``sam`` with its last dot-separated component replaced
    by ``bam``.

    Args:
        sam: SAM file to convert.
        threads: value for samtools' ``-@`` option.
        log: log target forwarded to ``mccutils.run_command_stdout``.

    Returns:
        Path of the BAM file.
    """
    # rpartition yields ("", "", sam) when there is no dot, which gives the
    # same result ("bam") as replacing the only split component.
    stem, dot, _ = sam.rpartition(".")
    bam = stem + dot + "bam"

    view_cmd = ["samtools", "view", "-Sb", "-@", str(threads), sam]
    mccutils.run_command_stdout(view_cmd, bam, log=log)

    return bam
Exemple #13
0
def _write_normalized_cov(cov_path, chrom, positions, coverages, genome_depth):
    """Write ``chrom<TAB>position<TAB>coverage/genome_depth`` lines to ``cov_path``.

    The normalized value is rounded to two decimals.
    """
    with open(cov_path, "w") as covfile:
        for i, pos in enumerate(positions):
            cov = str(round(coverages[i]/genome_depth, 2))
            covfile.write("\t".join([chrom, str(pos), cov])+"\n")


def make_depth_table(te_fasta, bam, genome_depth, run_id, out, depth_csv, log, trim_edges=0):
    """Compute per-TE normalized depth and write it to ``depth_csv``.

    For every header line in ``te_fasta`` this runs ``samtools depth`` twice
    on the region named by the header (``-Q 1`` -> ``.highQ.cov``,
    ``-Q 0`` -> ``.allQ.cov``), writes per-position coverage divided by
    ``genome_depth`` next to them, and appends one CSV row with the average
    depths (from ``get_avg_depth``) divided by ``genome_depth``.

    Changes from the original: the unused ``avg_uniq_norm_depths`` list is
    gone, the ``te-depth-files`` directory is created once before the loop
    instead of once per TE (so it now exists even when the FASTA has no
    header lines), and the duplicated normalized-coverage writer is a helper.

    Args:
        te_fasta: FASTA whose header lines name the TEs.
        bam: BAM file queried by ``samtools depth``.
        genome_depth: divisor used for normalization.
        run_id: unused here; kept for interface compatibility.
        out: output directory; files go to ``<out>/te-depth-files``.
        depth_csv: path of the CSV table to write.
        log: log target forwarded to the ``mccutils`` helpers.
        trim_edges: forwarded to ``get_avg_depth``.

    Returns:
        Tuple ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` of parallel lists.
    """
    mccutils.log("coverage","creating TE depth coverage table", log=log)
    with open(depth_csv, "w") as table:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth"+"\n")

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    depth_dir = out+"/te-depth-files"
    mccutils.mkdir(depth_dir)

    with open(te_fasta,"r") as fa:
        for line in fa:
            if ">" not in line:
                continue

            te_name = line.replace("\n","").replace(">","")

            highQ = depth_dir+"/"+te_name+".highQ.cov"
            command = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "1"]
            mccutils.run_command_stdout(command, highQ, log=log)

            allQ = depth_dir+"/"+te_name+".allQ.cov"
            command = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "0"]
            mccutils.run_command_stdout(command, allQ, log=log)

            # Make the normalized coverage files.
            allQ_chrom, allQ_pos, allQ_cov = read_samtools_depth_file(allQ)
            _write_normalized_cov(depth_dir+"/"+te_name+".allQ.normalized.cov",
                                  allQ_chrom, allQ_pos, allQ_cov, genome_depth)

            highQ_chrom, highQ_pos, highQ_cov = read_samtools_depth_file(highQ)
            _write_normalized_cov(depth_dir+"/"+te_name+".highQ.normalized.cov",
                                  highQ_chrom, highQ_pos, highQ_cov, genome_depth)

            avg_norm_depth = get_avg_depth(allQ, trim_edges=trim_edges)/genome_depth
            avg_uniq_norm_depth = get_avg_depth(highQ, trim_edges=trim_edges)/genome_depth

            with open(depth_csv, "a") as table:
                table.write(te_name+","+str(round(avg_norm_depth,2))+","+str(round(avg_uniq_norm_depth,2))+"\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
Exemple #14
0
def get_genome_depth(non_te_bed, bam, run_id, out, log):
    """Return the average depth of ``bam`` over the regions in ``non_te_bed``.

    Runs ``samtools depth -aa -b non_te_bed`` into a temporary file under
    ``<out>/input/``, averages it with ``get_avg_depth`` and removes the file.

    Args:
        non_te_bed: BED file passed to samtools via ``-b``.
        bam: BAM file to measure.
        run_id: string used as a prefix for the temporary file name.
        out: output directory; the temporary file goes to ``<out>/input/``.
        log: log target forwarded to the ``mccutils`` helpers.

    Returns:
        The value returned by ``get_avg_depth`` for the temporary file.
    """
    mccutils.log("coverage","determining the coverage depth of the genome", log=log)

    depth_file = out+"/input/"+run_id+"genome.depth"
    mccutils.run_command_stdout(
        ["samtools", "depth", "-aa", "-b", non_te_bed, bam, "-d", "0"],
        depth_file,
        log=log)

    genome_depth = get_avg_depth(depth_file)
    mccutils.remove(depth_file)
    return genome_depth
Exemple #15
0
def map_reads(reference, fq1, threads, sample_name, run_id, out, log, fq2=None):
    """Map reads to ``reference`` with ``bwa mem`` and return the SAM path.

    A read group with ID and SM set to ``sample_name`` is attached via ``-R``.
    ``fq2`` is added to the command only when it is not ``None``.

    Args:
        reference: reference FASTA passed to bwa.
        fq1: first FASTQ file.
        threads: value for bwa's ``-t`` option.
        sample_name: sample name used in the read group and the SAM name.
        run_id: string used as a prefix for the SAM file name.
        out: output directory; the SAM goes to ``<out>/input/``.
        log: log target forwarded to the ``mccutils`` helpers.
        fq2: optional second FASTQ file.

    Returns:
        Path of the SAM file.
    """
    mccutils.log("coverage","mapping reads to augmented reference genome", log=log)

    read_group = "@RG\\tID:"+sample_name+"\\tSM:"+sample_name
    reads = [fq1] if fq2 is None else [fq1, fq2]
    command = ["bwa", "mem", "-t", str(threads), "-R", read_group, reference] + reads

    sam = out+"/input/"+run_id+"_"+sample_name+".sam"
    mccutils.run_command_stdout(command, sam, log=log)

    return sam
Exemple #16
0
def main():
    """Sort the input SAM with coreutils ``sort`` for TE-locate.

    ``sort`` runs with a 1G buffer and ``<out>/tmp`` as its temporary
    directory; its stdout is written to the first snakemake output.
    """
    log = snakemake.params.log
    mccutils.log("processing",
                 "sorting SAM file for compatibility with TE-locate",
                 log=log)

    tmp_dir_flag = "--temporary-directory=" + snakemake.config['args']['out'] + "/tmp"
    sort_cmd = ["sort", "-S", "1G", tmp_dir_flag, snakemake.input[0]]
    mccutils.run_command_stdout(sort_cmd, snakemake.output[0], log=log)

    mccutils.log("processing", "TE-locate SAM created")
Exemple #17
0
def main():
    """Convert the input SAM to a sorted, indexed BAM and write flagstat output.

    Four samtools steps run in sequence: ``view``, ``sort``, ``index`` and
    ``flagstat``. If any step raises, the traceback and a step-specific
    message are printed to stderr and the process exits with status 1.
    """
    log = snakemake.params.log
    mccutils.log("processing","Converting sam to bam", log=log)

    def abort(message, path):
        # Call only from inside an ``except`` block, so that format_exc()
        # sees the active exception.
        print(traceback.format_exc(), file=sys.stderr)
        print(message, path, file=sys.stderr)
        sys.exit(1)

    # Step 1: SAM -> unsorted BAM.
    try:
        view_cmd = ["samtools","view", "-@", str(snakemake.threads), "-Sb", "-t", snakemake.input.ref_idx, snakemake.input.sam]
        mccutils.run_command_stdout(view_cmd, snakemake.output.tmp_bam, log=log)
        mccutils.check_file_exists(snakemake.output.tmp_bam)
    except Exception:
        abort("ERROR...unable convert sam to bam using SAMtools...sam file:", snakemake.input.sam)

    # Step 2: sort. The output is given as a prefix, i.e. the BAM path with
    # every ".bam" occurrence stripped.
    try:
        sort_cmd = ["samtools", "sort", "-@", str(snakemake.threads), snakemake.output.tmp_bam, snakemake.output.bam.replace(".bam", "")]
        mccutils.run_command(sort_cmd, log=log)
        mccutils.check_file_exists(snakemake.output.bam)
    except Exception:
        abort("ERROR...falied to sort the bam file using samtools sort...bam file:", snakemake.output.tmp_bam)

    # Step 3: index.
    try:
        mccutils.run_command(["samtools", "index", snakemake.output.bam], log=log)
    except Exception:
        abort("ERROR...falied to index the bam file using samtools index...bam file:", snakemake.output.bam)

    # Step 4: flagstat.
    try:
        flagstat_cmd = ["samtools", "flagstat", snakemake.output.bam]
        mccutils.run_command_stdout(flagstat_cmd, snakemake.output.flagstat, log=log)
        mccutils.check_file_exists(snakemake.output.flagstat)
    except Exception:
        abort("ERROR...falied to generate flagstat file using samtools flagstat...bam file:", snakemake.output.bam)

    mccutils.log("processing","sam to bam converted")
Exemple #18
0
def make_redundant_bed(insertions, sample_name, out_dir):
    """Write the RelocaTE2 redundant BED and return the insertions in sorted order.

    All insertions are written as 6-column BED, sorted with ``bedtools sort``
    and copied below a track header into
    ``<out_dir>/<sample_name>_relocate2_redundant.bed``. Each sorted record is
    matched back to its insertion object through a key made of the six BED
    fields joined by ``_``; the 1-based sorted position is appended to both
    the written name and the object's ``name``.

    Args:
        insertions: iterable of objects exposing ``chromosome``, ``start``,
            ``end``, ``name`` and ``strand``.
        sample_name: sample name used in the track header and output path.
        out_dir: directory for intermediate and final files.

    Returns:
        list of the insertion objects in sorted-BED order, names updated.
    """

    def bed_fields(ins):
        # BED starts are 0-based, hence start - 1.
        return [
            ins.chromosome,
            str(ins.start - 1),
            str(ins.end), ins.name, "0", ins.strand
        ]

    tmp_bed = out_dir + "/tmp.bed"

    by_key = {}
    for ins in insertions:
        by_key["_".join(bed_fields(ins))] = ins

    with open(tmp_bed, "w") as handle:
        for ins in insertions:
            handle.write("\t".join(bed_fields(ins)) + "\n")

    sorted_bed = out_dir + "/sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", tmp_bed],
                                sorted_bed)

    redundant_bed = out_dir + "/" + sample_name + "_relocate2_redundant.bed"
    track_header = ('track name="' + sample_name +
                    '_RelocaTE2" description="' + sample_name +
                    '_RelocaTE2"\n')

    numbered = []
    with open(redundant_bed, "w") as outbed:
        outbed.write(track_header)
        with open(sorted_bed, "r") as inbed:
            for rank, record in enumerate(inbed, start=1):
                suffix = str(rank)

                # Recover the insertion behind this sorted record and give
                # it the same numeric suffix as the BED name.
                lookup = record.replace("\t", "_").replace("\n", "")
                ins = by_key[lookup]
                ins.name += suffix
                numbered.append(ins)

                cols = record.split("\t")
                cols[3] += suffix
                outbed.write("\t".join(cols))

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)

    return numbered
Exemple #19
0
def make_copies(fq1, fq2, fq1copy, fq2copy):
    """Create uncompressed copies of the FASTQ inputs.

    A file whose last dot-separated component contains ``gz`` is expanded
    with ``zcat``; anything else is copied with ``cp``. When ``fq2`` is the
    string ``"None"``, ``fq2copy`` is created with ``touch`` instead.

    Args:
        fq1: first FASTQ path.
        fq2: second FASTQ path, or the string ``"None"``.
        fq1copy: destination for the first FASTQ.
        fq2copy: destination for the second FASTQ.

    Returns:
        Tuple ``(fq1copy, fq2copy)``.
    """

    def unpack_or_copy(src, dest):
        extension = src.split(".")[-1]
        if "gz" in extension:
            mccutils.run_command_stdout(["zcat", src], dest)
        else:
            mccutils.run_command(["cp", src, dest])

    unpack_or_copy(fq1, fq1copy)

    if fq2 == "None":
        mccutils.run_command(["touch", fq2copy])
    else:
        unpack_or_copy(fq2, fq2copy)

    return fq1copy, fq2copy
Exemple #20
0
def sam_to_bam(sam, reference, sample_name, threads, run_id, out, log):
    """Convert ``sam`` to a sorted, indexed BAM under ``<out>/input/``.

    ``samtools view`` writes a temporary BAM, ``samtools sort`` writes the
    final BAM from its stdout, ``samtools index`` indexes it, and the
    temporary BAM is removed.

    Args:
        sam: SAM file to convert.
        reference: reference FASTA; ``<reference>.fai`` is passed via ``-t``.
        sample_name: sample name used in the BAM file name.
        threads: value for samtools' ``-@`` option.
        run_id: string used as a prefix for the output file names.
        out: output directory; files go to ``<out>/input/``.
        log: log target forwarded to the ``mccutils`` helpers.

    Returns:
        Path of the sorted BAM.
    """
    mccutils.log("coverage","converting SAM to BAM, and indexing", log=log)

    thread_arg = str(threads)
    input_prefix = out+"/input/"+run_id
    tmp_bam = input_prefix+"_tmp.bam"
    sorted_bam = input_prefix+"_"+sample_name+".bam"

    mccutils.run_command_stdout(
        ["samtools", "view", "-Sb", "-@", thread_arg, "-t", reference+".fai", sam],
        tmp_bam,
        log=log)
    mccutils.run_command_stdout(
        ["samtools", "sort", "-@", thread_arg, tmp_bam],
        sorted_bam,
        log=log)
    mccutils.run_command(["samtools", "index", sorted_bam], log=log)

    mccutils.remove(tmp_bam)
    return sorted_bam
Exemple #21
0
def main():
    """Map the input reads to the reference with ``bwa mem``.

    ``-C`` is added when the ``save_comments`` config value evaluates truthy,
    and the second FASTQ is added unless the configured ``fq2`` is the string
    ``"None"``. Output goes to the first snakemake output. On any exception
    the traceback and an error message are printed to stderr and the process
    exits with status 1.
    """
    mccutils.log("processing",
                 "mapping reads to reference",
                 log=snakemake.log[0])

    try:
        # NOTE(review): eval() executes whatever string the config holds.
        # It is left in place to preserve behavior, but should be replaced
        # by a safe boolean parse if the config can come from untrusted input.
        comment_flag = ["-C"] if eval(snakemake.config['args']['save_comments']) else []

        read_group = ("@RG\\tID:" + snakemake.params.sample +
                      "\\tSM:" + snakemake.params.sample)
        command = ["bwa", "mem"] + comment_flag + [
            "-t", str(snakemake.threads),
            "-R", read_group,
            snakemake.input.ref, snakemake.input.fq1
        ]

        if snakemake.config['in']['fq2'] != "None":
            command.append(snakemake.input.fq2)

        mccutils.run_command_stdout(command,
                                    snakemake.output[0],
                                    log=snakemake.log[0])
        mccutils.check_file_exists(snakemake.output[0])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)

        paired = snakemake.config['in']['fq2'] != "None"
        if paired:
            reads = [snakemake.input.fq1, snakemake.input.fq2]
        else:
            reads = [snakemake.input.fq1]

        print("ERROR...unable to map reads (bwa mem) using reference fasta:",
              snakemake.input.ref,
              "and reads:",
              *reads,
              file=sys.stderr)
        sys.exit(1)

    mccutils.log("processing", "read mapping complete")
def get_avg_coverage(ref, bam, out):
    """Return the mean depth of ``bam`` over positions on ``ref`` sequences.

    ``samtools depth`` output is written to ``<out>/tmp``; every reported
    position whose sequence name is a record id in ``ref`` contributes its
    depth to the average. The temporary file is removed afterwards.

    Changes from the original: sequence names are held in a set, so the
    per-line membership test no longer scans a list, and an input with no
    matching positions returns ``0.0`` instead of raising
    ``ZeroDivisionError``.

    Args:
        ref: reference FASTA parsed with ``SeqIO``.
        bam: BAM file passed to ``samtools depth``.
        out: directory that receives the temporary depth file.

    Returns:
        Average depth rounded to three decimals (``0.0`` when no position
        was counted).
    """
    chrom = {str(record.id) for record in SeqIO.parse(ref, "fasta")}

    tmp = out + "/tmp"
    command = ['samtools', 'depth', bam]
    mccutils.run_command_stdout(command, tmp)

    cov_total = 0
    pos = 0
    with open(tmp, "r") as depth:
        for line in depth:
            split_line = line.split("\t")
            if split_line[0] in chrom:
                pos += 1
                cov_total += int(split_line[2])

    mccutils.remove(tmp)

    if pos == 0:
        # Nothing was counted, so the average is defined as zero.
        return 0.0
    return round(cov_total / pos, 3)
Exemple #23
0
def make_run_config(args, sample_name, ref_name, full_command,
                    current_directory):
    """Write the JSON run configuration and return the generated run id.

    Side effects visible in this function: creates the ``snakemake/config``
    and log directories under ``args.out``; changes the working directory to
    the directory of this file while looking up the git commit; and rewrites
    ``config.OUT_PATHS``, ``config.INTERMEDIATE_PATHS`` and
    ``config_install.ENV`` in place with their placeholders substituted.

    Args:
        args: parsed command-line arguments (``out``, ``methods``, ``proc``,
            ``reference``, ``consensus``, ``first``, ``second``, ... are read).
        sample_name: sample name substituted into paths and stored.
        ref_name: reference name substituted into paths and stored.
        full_command: stored under ``args.full_command`` in the config.
        current_directory: stored under ``args.call_directory`` in the config.

    Returns:
        The random 7-digit integer run id.
    """

    def fill_placeholders(paths, substitutions):
        # Rewrites ``paths`` in place, applying the substitutions in order.
        for name in paths.keys():
            for placeholder, value in substitutions:
                paths[name] = paths[name].replace(placeholder, value)

    run_id = random.randint(1000000, 9999999)

    config_dir = args.out + "/snakemake/config"
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(config_dir)
    run_config = config_dir + "/config_" + str(run_id) + ".json"

    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # Look up the git commit hash for the summary report; "?" on failure.
    git_commit = "?"
    try:
        os.chdir(mcc_path)
        commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"], commit_file)
        with open(commit_file, "r") as handle:
            # The last line of the file wins.
            for text in handle:
                git_commit = text.replace("\n", "")
        mccutils.remove(commit_file)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    out_files = config.OUT_PATHS
    fill_placeholders(out_files, [
        (config.INPUT_DIR, input_dir),
        (config.RESULTS_DIR, results_dir),
        (config.SAMPLE_NAME, sample_name),
    ])
    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    log_dir = (args.out + "/logs/" + now.strftime("%Y%m%d.%H%M%S") + "." +
               str(run_id) + "/")
    mccutils.mkdir(log_dir)

    chromosomes = [
        mccutils.replace_special_chars(str(record.id))
        for record in SeqIO.parse(args.reference, "fasta")
    ]

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max(
            1,
            calculate_max_threads(args.proc,
                                  args.methods,
                                  config.MULTI_THREAD_METHODS,
                                  slow=args.slow)),
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes)
    }

    # Paths of the user-supplied input files.
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # Where the McClintock copies will be stored.
    data["mcc"] = config.INTERMEDIATE_PATHS
    fill_placeholders(data["mcc"], [
        (config.INPUT_DIR, input_dir),
        (config.REF_NAME, ref_name),
        (config.SAMPLE_NAME, sample_name),
    ])

    # Resolved again here rather than reusing mcc_path, because this runs
    # after the chdir above.
    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    data["envs"] = config_install.ENV
    fill_placeholders(data["envs"], [(config_install.ENV_PATH, env_path)])

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
Exemple #24
0
def run_popoolationte(sam,
                      reference,
                      taxon,
                      read_len,
                      insert_size,
                      max_dist,
                      ref_inserts,
                      script_dir,
                      out_dir,
                      log=None,
                      identify_min_count=3,
                      identify_min_qual=15,
                      crosslink_site_shift=100,
                      update_te_inserts_site_shift=100,
                      estimate_polymorphism_min_qual=15,
                      filter_min_count=5):
    """Run the six PoPoolationTE perl scripts in sequence.

    Script paths are ``script_dir + <script name>`` and output paths are
    ``out_dir + <file name>``; no separator is inserted, so both prefixes
    must already end with one. Each step consumes the previous step's output.
    The function returns nothing.

    Args:
        sam: SAM file passed to the identify and estimate steps.
        reference: reference passed to ``genomic-N-2gtf.pl``.
        taxon: TE hierarchy file.
        read_len: read length (also used as narrow range and min distance).
        insert_size: value for ``--insert-distance``.
        max_dist: value for ``--max-dist``.
        ref_inserts: known insertions passed via ``--known``.
        script_dir: prefix of the perl script paths.
        out_dir: prefix of the output file paths.
        log: log target forwarded to the ``mccutils`` command runners.
        identify_min_count: ``--min-count`` of the identify step.
        identify_min_qual: ``--min-map-qual`` of the identify step.
        crosslink_site_shift: ``--single-site-shift`` of the crosslink step.
        update_te_inserts_site_shift: ``--single-site-shift`` of the update step.
        estimate_polymorphism_min_qual: ``--min-map-qual`` of the estimate step.
        filter_min_count: ``--min-count`` of the filter step.
    """
    # Step 1: find forward/reverse insertion sites.
    step = "identify-te-insertsites.pl"
    mccutils.log("popoolationte", step)
    insert_sites = out_dir + "te-fwd-rev.txt"
    mccutils.run_command([
        "perl", script_dir + step,
        "--input", sam,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--narrow-range", str(read_len),
        "--min-count", str(identify_min_count),
        "--min-map-qual", str(identify_min_qual),
        "--output", insert_sites,
        "--insert-distance", str(insert_size),
        "--read-length", str(read_len)
    ], log=log)

    # Step 2: GTF of poly-N stretches in the reference.
    step = "genomic-N-2gtf.pl"
    mccutils.log("popoolationte", step)
    poly_n = out_dir + "poly_n.gtf"
    mccutils.run_command_stdout(
        ["perl", script_dir + step, "--input", reference], poly_n, log=log)

    # Step 3: crosslink the directional sites.
    step = "crosslink-te-sites.pl"
    mccutils.log("popoolationte", step)
    crosslinked = out_dir + "te-inserts.txt"
    mccutils.run_command([
        "perl", script_dir + step,
        "--directional-insertions", insert_sites,
        "--min-dist", str(read_len),
        "--max-dist", str(max_dist),
        "--output", crosslinked,
        "--single-site-shift", str(crosslink_site_shift),
        "--poly-n", poly_n,
        "--te-hierarchy", taxon,
        "--te-hier-level", "family"
    ], log=log)

    # Step 4: merge in the known insertions.
    step = "update-teinserts-with-knowntes.pl"
    mccutils.log("popoolationte", step)
    updated_inserts = out_dir + "te-insertions-updated.txt"
    mccutils.run_command([
        "perl", script_dir + step,
        "--known", ref_inserts,
        "--output", updated_inserts,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--max-dist", str(max_dist),
        "--te-insertions", crosslinked,
        "--single-site-shift", str(update_te_inserts_site_shift)
    ], log=log)

    # Step 5: estimate polymorphism.
    step = "estimate-polymorphism.pl"
    mccutils.log("popoolationte", step)
    te_polymorphism = out_dir + "te-polymorphism"
    mccutils.run_command([
        "perl", script_dir + step,
        "--sam-file", sam,
        "--te-insert-file", updated_inserts,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--min-map-qual", str(estimate_polymorphism_min_qual),
        "--output", te_polymorphism
    ], log=log)

    # Step 6: filter the insertions.
    step = "filter-teinserts.pl"
    mccutils.log("popoolationte", step)
    filtered = out_dir + "te-poly-filtered.txt"
    mccutils.run_command([
        "perl", script_dir + step,
        "--te-insertions", te_polymorphism,
        "--output", filtered,
        "--discard-overlapping",
        "--min-count", str(filter_min_count)
    ], log=log)
Exemple #25
0
def main():
    """Prepare the FASTQ inputs: optional trimming, then uncompressed copies.

    When ``"trimgalore"`` is among the methods, ``run_trim_galore`` is called
    in single- or paired-end mode (``fq2 == "None"`` selects single-end).
    The resulting reads are expanded with ``zcat`` or copied with ``cp`` into
    the two snakemake outputs; a missing second read yields an empty file
    created with ``touch``. Trimmed files that differ from the inputs are
    removed. On any exception both outputs are removed and the process exits
    with status 1.
    """
    fq1 = snakemake.input.fq1
    fq2 = snakemake.params.fq2
    methods = snakemake.params.methods.split(",")
    processors = snakemake.threads
    mcc_out = snakemake.params.out
    run_id = snakemake.params.run_id
    log = snakemake.params.log

    def unpack_or_copy(src, dest):
        # A last path component containing "gz" means the file is expanded.
        if "gz" in src.split(".")[-1]:
            mccutils.run_command_stdout(["zcat", src], dest)
        else:
            mccutils.run_command(["cp", src, dest])

    mccutils.log("processing", "prepping reads for McClintock")
    try:
        # Trim adaptors off the input fastq(s) when requested.
        trimmedfq, trimmedfq2 = fq1, fq2
        if "trimgalore" in methods:
            mccutils.log("processing", "running trim_galore", log=log)
            if fq2 == "None":
                trimmedfq = run_trim_galore(
                    fq1, run_id, log, mcc_out,
                    cores=processors,
                    flags=trimgalore.SINGLE_END_FLAGS)
            else:
                trimmedfq, trimmedfq2 = run_trim_galore(
                    fq1, run_id, log, mcc_out,
                    fq2=fq2,
                    cores=processors,
                    flags=trimgalore.PAIRED_END_FLAGS)

        # Make unzipped copies in the McClintock input directory.
        unpack_or_copy(trimmedfq, snakemake.output[0])
        if trimmedfq2 == "None":
            mccutils.run_command(["touch", snakemake.output[1]])
        else:
            unpack_or_copy(trimmedfq2, snakemake.output[1])

        # Remove trimmed read files that trim_galore produced.
        for trimmed, original in ((trimmedfq, fq1), (trimmedfq2, fq2)):
            if trimmed != original:
                mccutils.remove(trimmed)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR processing of FastQ files failed...check that your FastQ files are formatted correctly...Exiting...",
            file=sys.stderr)
        mccutils.remove(snakemake.output[0])
        mccutils.remove(snakemake.output[1])
        sys.exit(1)

    mccutils.log("processing", "read setup complete")
def run_popoolationte(sam,
                      reference,
                      taxon,
                      read_len,
                      insert_size,
                      max_dist,
                      ref_inserts,
                      script_dir,
                      out_dir,
                      params,
                      log=None):
    """Run the PoPoolationTE perl scripts one after another.

    The scripts are executed in this order, each consuming files written by
    an earlier step:

    1. identify-te-insertsites.pl        -> te-fwd-rev.txt
    2. genomic-N-2gtf.pl                 -> poly_n.gtf
    3. crosslink-te-sites.pl             -> te-inserts.txt
    4. update-teinserts-with-knowntes.pl -> te-insertions-updated.txt
    5. estimate-polymorphism.pl          -> te-polymorphism
    6. filter-teinserts.pl               -> te-poly-filtered.txt

    ``script_dir`` and ``out_dir`` are joined to file names by plain string
    concatenation; no path separator is inserted here.

    ``params`` is indexed by script name; every script except
    genomic-N-2gtf.pl must have an entry. Each entry maps an extra command
    line option to its value, and those pairs are appended (value passed
    through ``str``) to the end of that script's command.

    ``log`` is forwarded to the mccutils command runners.
    """

    def launch(script, argv):
        # Tack the user-configured option/value pairs for this script onto
        # the fixed arguments, then execute the finished command line.
        for option, value in params[script].items():
            argv.extend((option, str(value)))
        mccutils.run_command(argv, log=log)

    # step 1: locate TE insert sites from the mapped reads
    mccutils.log("popoolationte", "identify-te-insertsites.pl")
    insert_sites = out_dir + "te-fwd-rev.txt"
    launch("identify-te-insertsites.pl", [
        "perl", script_dir + "identify-te-insertsites.pl",
        "--input", sam,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--narrow-range", str(read_len),
        "--output", insert_sites,
        "--insert-distance", str(insert_size),
        "--read-length", str(read_len),
    ])

    # step 2: poly-N annotation of the reference; this script takes no
    # configurable params and its output is captured via run_command_stdout
    mccutils.log("popoolationte", "genomic-N-2gtf.pl")
    poly_n = out_dir + "poly_n.gtf"
    mccutils.run_command_stdout(
        ["perl", script_dir + "genomic-N-2gtf.pl", "--input", reference],
        poly_n,
        log=log)

    # step 3: crosslink the directional insert sites
    mccutils.log("popoolationte", "crosslink-te-sites.pl")
    crosslinked = out_dir + "te-inserts.txt"
    launch("crosslink-te-sites.pl", [
        "perl", script_dir + "crosslink-te-sites.pl",
        "--directional-insertions", insert_sites,
        "--min-dist", str(read_len),
        "--max-dist", str(max_dist),
        "--output", crosslinked,
        "--poly-n", poly_n,
        "--te-hierarchy", taxon,
        "--te-hier-level", "family",
    ])

    # step 4: update the crosslinked inserts with the known (reference) TEs
    mccutils.log("popoolationte", "update-teinserts-with-knowntes.pl")
    updated_inserts = out_dir + "te-insertions-updated.txt"
    launch("update-teinserts-with-knowntes.pl", [
        "perl", script_dir + "update-teinserts-with-knowntes.pl",
        "--known", ref_inserts,
        "--output", updated_inserts,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--max-dist", str(max_dist),
        "--te-insertions", crosslinked,
    ])

    # step 5: estimate polymorphism for the updated inserts
    mccutils.log("popoolationte", "estimate-polymorphism.pl")
    te_polymorphism = out_dir + "te-polymorphism"
    launch("estimate-polymorphism.pl", [
        "perl", script_dir + "estimate-polymorphism.pl",
        "--sam-file", sam,
        "--te-insert-file", updated_inserts,
        "--te-hierarchy-file", taxon,
        "--te-hierarchy-level", "family",
        "--output", te_polymorphism,
    ])

    # step 6: filter the polymorphism calls
    mccutils.log("popoolationte", "filter-teinserts.pl")
    filtered = out_dir + "te-poly-filtered.txt"
    launch("filter-teinserts.pl", [
        "perl", script_dir + "filter-teinserts.pl",
        "--te-insertions", te_polymorphism,
        "--output", filtered,
        "--discard-overlapping",
    ])
def sam_to_bam(sam, bam, threads=1, log=None):
    """Convert ``sam`` to BAM with ``samtools view -Sb``.

    The command's output is captured into ``bam`` by
    ``mccutils.run_command_stdout``; ``threads`` is passed to samtools via
    ``-@``. Returns the ``bam`` path that was given.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)
    view_command = ["samtools", "view", "-@", str(threads), "-Sb", sam]
    mccutils.run_command_stdout(view_command, bam, log=log)
    return bam
Exemple #28
0
def make_nonredundant_bed(insertions,
                          sample,
                          out,
                          log,
                          acceptable_classes=("1p1",),
                          frequency_theshold=0.1):
    """Write a sorted, non-redundant BED track of TEMP insertions.

    Insertions are filtered, grouped by position, reduced to one
    representative per group, written to a temporary BED file, sorted with
    ``bedtools sort`` and finally copied below a ``track`` header line into
    ``<out>/<sample>_temp_nonredundant.bed``. Both intermediate files are
    removed afterwards.

    Args:
        insertions: iterable of insertion objects; the attributes read here
            are ``type``, ``classification``, ``frequency``, ``chromosome``,
            ``start``, ``end``, ``support``, ``name`` and ``direction``.
        sample: sample name, used in file names and in the track header.
        out: output directory (joined to file names with "/").
        log: forwarded to ``mccutils.run_command_stdout``.
        acceptable_classes: container of classifications that non-reference
            insertions must belong to; only membership (``in``) is tested.
            The default is an immutable tuple rather than a shared list.
        frequency_theshold: non-reference insertions must have a frequency
            strictly greater than this value.
    """
    unsorted_nonredundant_bed = out + "/" + sample + "_temp_unsorted_nonredundant.bed"

    # Collapse all inserts that pass the thresholds into groups keyed by
    # position. dicts preserve insertion order, so groups are later written
    # in order of first appearance.
    collapsed_insertions = {}
    for insert in insertions:
        if insert.type == "reference" or (
                insert.classification in acceptable_classes
                and insert.frequency > frequency_theshold):
            if insert.type == "reference":
                # reference TEs are only considered 'redundant' if they
                # share the same start and end
                key = insert.chromosome + "_" + str(insert.start) + "_" + str(
                    insert.end)
            else:
                # non-reference inserts are redundant when they share the
                # same chromosome and end position
                key = insert.chromosome + "_" + str(insert.end)

            collapsed_insertions.setdefault(key, []).append(insert)

    with open(unsorted_nonredundant_bed, "w") as bed:
        for group in collapsed_insertions.values():
            # Every group holds at least one insert. Start from the first and
            # switch to a later one only when it has strictly greater
            # support. A current best whose support is "!" is never replaced
            # (presumably the placeholder for reference TEs -- confirm).
            highest_supported_insert = group[0]
            for insert in group[1:]:
                if (highest_supported_insert.support != "!"
                        and insert.support > highest_supported_insert.support):
                    highest_supported_insert = insert

            line = "\t".join([
                highest_supported_insert.chromosome,
                str(highest_supported_insert.start),
                str(highest_supported_insert.end),
                highest_supported_insert.name, "0",
                highest_supported_insert.direction
            ])
            bed.write(line + "\n")

    tmp_bed = out + "/" + sample + "_temp_nonredundant.bed.tmp"

    command = ["bedtools", "sort", "-i", unsorted_nonredundant_bed]
    mccutils.run_command_stdout(command, tmp_bed, log=log)

    # Final file: track header followed by the sorted records.
    nonredundant_bed = out + "/" + sample + "_temp_nonredundant.bed"
    with open(nonredundant_bed, "w") as outbed:
        with open(tmp_bed, "r") as inbed:
            header = 'track name="%s_TEMP" description="%s_TEMP"' % (sample,
                                                                     sample)
            outbed.write(header + "\n")
            outbed.writelines(inbed)

    mccutils.remove(tmp_bed)
    mccutils.remove(unsorted_nonredundant_bed)
Exemple #29
0
def map_reads(ref, fq, outsam, threads=1, log=None):
    """Map the reads in ``fq`` against ``ref`` using ``bwa bwasw``.

    The command's output is captured into ``outsam`` by
    ``mccutils.run_command_stdout``; ``threads`` is passed to bwa via
    ``-t``. Returns the ``outsam`` path that was given.
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)
    bwasw_command = ["bwa", "bwasw", "-t", str(threads), ref, fq]
    mccutils.run_command_stdout(bwasw_command, outsam, log=log)
    return outsam