Beispiel #1
0
def split_consensus_fasta(fasta, ref_name, out):
    """Write every record of a FASTA file to its own re-wrapped FASTA file.

    For each record parsed from ``fasta`` the sequence is first dumped to
    ``<out>/split_fasta/<ref_name>_<sanitized id>.fasta.tmp``, that file is
    passed to ``fix_fasta.fix_fasta_lines`` with a width argument of 80, and
    the returned lines are written (one per line) to the matching ``.fasta``
    file. The temporary file is then handed to ``mccutils.remove``.

    Args:
        fasta: Path of the FASTA file to split.
        ref_name: Prefix placed in front of every output file name.
        out: Directory under which the ``split_fasta/`` folder is created.

    Returns:
        A list with the original (unsanitized) id of every record, in the
        order the records were read.
    """
    # Characters that are swapped for "_" when a record id is used as part
    # of a file name. The header written into the file keeps the raw id.
    unsafe_chars = (
        ";", "&", "(", ")", "|", "*", "?", "[", "]", "~",
        "{", "}", "<", "!", "^", '"', "'", "\\", "$", "/",
    )
    to_underscore = str.maketrans(dict.fromkeys(unsafe_chars, "_"))

    split_dir = out + "/split_fasta/"
    mccutils.mkdir(split_dir)

    record_ids = []
    for record in SeqIO.parse(fasta, "fasta"):
        record_id = str(record.id)
        record_ids.append(record_id)

        # Shared stem of the temporary and the final file for this record.
        stem = split_dir + ref_name + "_" + record_id.translate(to_underscore)
        scratch_path = stem + ".fasta.tmp"
        final_path = stem + ".fasta"

        with open(scratch_path, "w") as scratch:
            scratch.writelines((">" + record_id + "\n", str(record.seq) + "\n"))

        wrapped = fix_fasta.fix_fasta_lines(scratch_path, 80)
        with open(final_path, "w") as final:
            final.writelines(entry + "\n" for entry in wrapped)

        mccutils.remove(scratch_path)

    return record_ids
Beispiel #2
0
def fix_fasta_lines(infasta, outfasta, length=80):
    """Run ``fix_fasta.fix_fasta_lines`` on a file and save the result.

    Args:
        infasta: Path forwarded to ``fix_fasta.fix_fasta_lines``.
        outfasta: Path of the file that receives the returned lines.
        length: Second argument forwarded to ``fix_fasta.fix_fasta_lines``
            (defaults to 80).

    Returns:
        ``outfasta``, unchanged.
    """
    # The input is processed before the output is opened for writing, so the
    # output file is not truncated until the helper has returned.
    wrapped = fix_fasta.fix_fasta_lines(infasta, length)
    with open(outfasta, "w") as handle:
        handle.writelines(entry + "\n" for entry in wrapped)

    return outfasta
Beispiel #3
0
def main():
    """Create the coverage FASTA for the Snakemake job running this script.

    If ``snakemake.params.coverage_fasta`` is the string ``"None"``, the
    output ``snakemake.output.coverage_fasta`` is created with ``touch``.
    Otherwise the given file is passed to ``fix_fasta.fix_fasta_lines`` with
    a width argument of 80 and the returned lines are handed to
    ``write_fasta`` together with the output path.

    On any exception the traceback and an error message are printed to
    stderr, up to the first three declared outputs are passed to
    ``mccutils.remove``, and the process exits with status 1.
    """
    mccutils.log("processing", "making coverage fasta")
    try:
        length = 80
        if snakemake.params.coverage_fasta == "None":
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])
        else:
            coverage_fasta = snakemake.params.coverage_fasta
            lines = fix_fasta.fix_fasta_lines(coverage_fasta, length)
            write_fasta(lines, snakemake.output.coverage_fasta)

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print(
            "ERROR...failed to create coverage fasta, check the formatting of :",
            snakemake.params.coverage_fasta,
            file=sys.stderr)
        # Clean up at most the first three outputs (the ones previously
        # addressed by index). Slicing avoids an IndexError inside this
        # handler when the rule declares fewer than three outputs, which
        # would otherwise pre-empt the sys.exit(1) below.
        for stale_output in list(snakemake.output)[:3]:
            mccutils.remove(stale_output)
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
def repeat_mask(reference, te_fasta, chromosomes, procs, run_id, log, out):
    """Run RepeatMasker on a reference and write a GFF of renamed TE hits.

    RepeatMasker is launched from ``<out>/tmp/repeatmasker_<run_id>`` with
    ``te_fasta`` as its ``-lib`` and that directory as its ``-dir``. The
    resulting ``<reference basename>.out.gff`` is then filtered and rewritten
    into ``<out>/tmp/<run_id>tmpreferenceTEs.gff``.

    Args:
        reference: Path of the reference sequence given to RepeatMasker.
        te_fasta: Path of the TE library passed via ``-lib``.
        chromosomes: Container of sequence names; only GFF rows whose first
            column is in it are written.
        procs: Value passed (as a string) to ``-pa``.
        run_id: Identifier used in the working directory and output name.
        log: Forwarded to ``mccutils.run_command`` as ``log``.
        out: Base output directory. The process working directory is set to
            it after RepeatMasker has run.

    Returns:
        Path of the rewritten GFF file.

    On any exception the traceback and an error message are printed to
    stderr and the process exits with status 1.
    """
    try:
        outdir = out + "/tmp/repeatmasker_" + run_id
        mccutils.mkdir(outdir)

        # RepeatMasker is run from inside its own output directory.
        os.chdir(outdir)
        mccutils.run_command(
            [
                "RepeatMasker",
                "-pa", str(procs),
                "-lib", te_fasta,
                "-dir", outdir,
                "-s",
                "-gff",
                "-nolow",
                "-no_is",
                reference,
            ],
            log=log,
        )
        os.chdir(out)

        # RepeatMasker appears to override the custom database names during the ProcessRepeats
        # this step changes them back, more rules may be needed for other reference genomes
        ref_name = os.path.basename(reference)
        repeatmasker_gff = outdir + "/" + ref_name + ".out.gff"
        formatted_ref_tes = out + "/tmp/" + run_id + "tmpreferenceTEs.gff"
        with open(repeatmasker_gff, "r") as rmgff, open(formatted_ref_tes, "w") as outgff:
            for raw_line in rmgff:
                # Any line containing "#" anywhere is skipped, not only
                # lines that start with it.
                # NOTE(review): presumably meant to drop comment lines only;
                # confirm no TE/sequence names contain "#".
                if "#" in raw_line:
                    continue

                renamed = raw_line.replace("McClintock-int", "McClintock")
                renamed = renamed.replace("POGON1", "pogo")
                columns = renamed.split("\t")

                # Column 9 is read before the chromosome filter, so a row
                # with fewer than nine tab-separated columns raises here.
                attributes = columns[8]
                if columns[0] not in chromosomes:
                    continue

                # Second space-separated token of the attributes, without
                # quotes, keeping the part after the first ":".
                target = attributes.split(" ")[1]
                te = target.replace('"', '').split(":")[1]

                columns[2] = te
                columns[8] = ";".join(["ID=" + te, "Name=" + te, "Alias=" + te])
                outgff.write("\t".join(columns) + '\n')

        masked_fasta = outdir + "/" + ref_name + ".masked"
        # The returned lines are not used in this function; the call is kept
        # because it may raise or have side effects that are not visible here.
        fix_fasta.fix_fasta_lines(masked_fasta, 80)

        mccutils.check_file_exists(formatted_ref_tes)

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("ERROR...Failed to run repeatmasker on: ",
              reference,
              "with lib:",
              te_fasta,
              "check file formatting...exiting...",
              file=sys.stderr)
        sys.exit(1)

    return formatted_ref_tes