def __init__(self, args_snp):
    '''Create the helper objects and derive every path used for SNP calling.

    args_snp must provide ``types``, ``out_folder`` and ``fastas``.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Results are grouped under a sub-folder chosen by the analysis type.
    file_type = ("compare_reference" if args_snp.types == "reference"
                 else "validate_target")
    out_dir = args_snp.out_folder
    result_dir = os.path.join(out_dir, file_type)
    self.seq_path = os.path.join(result_dir, "seqs")
    self.stat_path = os.path.join(result_dir, "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {
        "table": os.path.join(result_dir, "SNP_table"),
        "raw": os.path.join(result_dir, "SNP_raw_outputs"),
        "tmp": os.path.join(out_dir, "tmp_bcf"),
        "depth": os.path.join(out_dir, "tmp_depth"),
    }
    # A merged BAM left in the output folder triggers a helper clean-up
    # of the "whole_read" files before the new paths are recorded.
    if "whole_reads.bam" in os.listdir(out_dir):
        self.helper.remove_all_content(out_dir, "whole_read", "file")
    self.bams = {
        "whole": os.path.join(out_dir, "whole_reads.bam"),
        "sort": os.path.join(out_dir, "whole_reads_sorted.bam"),
        "bams": [],
    }
    self.header = os.path.join(out_dir, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
def __init__(self):
    '''Create the helper objects and name the scratch files used for merging.'''
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Scratch file names; the merge methods join them onto a target folder.
    self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
    self.tmp_wig_forward, self.tmp_wig_reverse = (
        "tmp_forward.wig", "tmp_reverse.wig")
def __init__(self, args_snp):
    '''Create the helper objects and derive every path used for SNP calling.

    args_snp must provide ``types`` and ``out_folder``.  The figure
    folder below the statistics folder is created through the helper.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Results are grouped under a sub-folder chosen by the analysis type.
    file_type = ("compare_related_and_reference_genomes"
                 if args_snp.types == "related_genome"
                 else "mutations_of_reference_genomes")
    out_dir = args_snp.out_folder
    result_dir = os.path.join(out_dir, file_type)
    self.seq_path = os.path.join(result_dir, "seqs")
    self.stat_path = os.path.join(result_dir, "statistics")
    self.fig_path = os.path.join(self.stat_path, "figs")
    self.helper.check_make_folder(self.fig_path)
    self.outputs = {
        "table": os.path.join(result_dir, "SNP_tables"),
        "raw": os.path.join(result_dir, "SNP_raw_outputs"),
        "tmp": os.path.join(out_dir, "tmp_bcf"),
        "depth": os.path.join(out_dir, "tmp_depth"),
    }
    self.bams = {
        "whole": os.path.join(out_dir, "whole_reads.bam"),
        "sort": os.path.join(out_dir, "whole_reads_sorted.bam"),
        "bams": [],
    }
    self.header = os.path.join(out_dir, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
class TestSeqEditer(unittest.TestCase):
    '''Tests for SeqEditer: mutation-table import, sequence and header edits.'''

    def setUp(self):
        '''Create the working folders and the object under test.'''
        self.example = Example()
        self.test_folder = "test_folder"
        self.fasta = os.path.join(self.test_folder, "fasta")
        # makedirs creates both levels and tolerates a folder left behind
        # by an interrupted run, so the fasta sub-folder always exists.
        os.makedirs(self.fasta, exist_ok=True)
        self.seq = SeqEditer()

    def tearDown(self):
        '''Remove the working folder and everything the test wrote in it.'''
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_import_data(self):
        '''_import_data returns one record per target with its mutations.'''
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        datas = self.seq._import_data(mod_table)
        expected = [
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'c', 'ref_nt': 'a', 'position': '3'},
                    {'tar_nt': '-', 'ref_nt': 'a', 'position': '6'},
                ],
                'target_id': 'NC_test.1',
            },
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'g', 'ref_nt': '-', 'position': '6'},
                ],
                'target_id': 'test_case2',
            },
        ]
        self.assertListEqual(datas, expected)

    def test_modify_seq(self):
        '''modify_seq writes one edited fasta file per target id.'''
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        gen_file(os.path.join(self.fasta, "NC_000915.1.fa"),
                 self.example.fasta)
        self.seq.modify_seq(self.fasta, mod_table, self.test_folder)
        datas = import_data(os.path.join(self.test_folder, "NC_test.1.fa"))
        self.assertEqual("\n".join(datas), self.example.out_1)
        datas = import_data(os.path.join(self.test_folder, "test_case2.fa"))
        self.assertEqual("\n".join(datas), self.example.out_2)

    def test_modify_header(self):
        '''modify_header reduces a pipe-separated header to its 4th field.'''
        input_file = os.path.join(self.test_folder, "test.fa")
        gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT")
        self.seq.modify_header(input_file)
        datas = import_data(input_file)
        self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT")
def __init__(self, tar_folder, ref_folder):
    '''Create the helper objects and record the two temporary folders.'''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Each temporary folder is a "tmp" directory below its parent folder.
    tmp_tar = os.path.join(tar_folder, "tmp")
    tmp_ref = os.path.join(ref_folder, "tmp")
    self.folders = {"tmp_tar": tmp_tar, "tmp_ref": tmp_ref}
def setUp(self):
    '''Create the working folders and the object under test.'''
    self.example = Example()
    self.test_folder = "test_folder"
    self.fasta = os.path.join(self.test_folder, "fasta")
    # makedirs creates both levels and tolerates a folder left behind
    # by an interrupted run, so the fasta sub-folder always exists.
    os.makedirs(self.fasta, exist_ok=True)
    self.seq = SeqEditer()
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to
    reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record the temporary target folder.

        ref_folder is accepted for interface compatibility; the temporary
        reference folder is set later by gen_folder.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Copy the reference files into a scratch folder and parse them.

        Returns the path of the scratch reference folder
        (``<out_folder>/tmp_reference``).
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Test the temporary target folder itself: looking for an entry
        # called "tmp_tar" in out_folder never matched the real path, so a
        # leftover folder made the following mkdir fail.
        if os.path.isdir(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files,
                         output, out_folder):
        '''Apply the mutation table and write the requested fasta files.

        Each entry of ``output`` has the form ``filename:strain1,strain2``;
        the sequences of the listed strains are concatenated into
        ``filename``, separated by a newline.  Temporary folders are
        removed afterwards.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Transfering to target fasta")
        # The generated files do not change while the outputs are written,
        # so the folder is listed once instead of once per strain.
        available = set(os.listdir(self.folders["tmp_tar"]))
        for file_ in output:
            datas = file_.split(":")
            filename = datas[0]
            strains = datas[1].split(",")
            first = True
            # The context manager closes the output even if a read fails.
            with open(filename, "w") as out:
                for strain in strains:
                    if strain + ".fa" in available:
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(os.path.join(self.folders["tmp_tar"],
                                               strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error: No fasta information of "
                              "{0}.fa".format(strain))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta file to remapping again.")
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to
    reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record the two temporary folders.'''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {
            "tmp_tar": os.path.join(tar_folder, "tmp"),
            "tmp_ref": os.path.join(ref_folder, "tmp"),
        }

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        '''Apply the mutation table and write the target fasta files.

        When ``output`` is given, each entry has the form
        ``filename:strain1_and_strain2``; the listed strains are
        concatenated into ``<tar_folder>/<filename>.fa``, separated by a
        newline.  When ``output`` is None the helper moves every generated
        ``.fa`` file into ``tar_folder``.  Temporary folders are removed
        afterwards.
        '''
        self.multiparser.parser_fasta(ref_folder)
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("transfer to target fasta...")
        if output is not None:
            # The generated files do not change while the outputs are
            # written, so the folder is listed once instead of per strain.
            available = set(os.listdir(self.folders["tmp_tar"]))
            for file_ in output:
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                first = True
                # The context manager closes the output even on errors.
                with open(os.path.join(tar_folder,
                                       filename + ".fa"), "w") as out:
                    for strain in strains:
                        if strain + ".fa" in available:
                            if first:
                                first = False
                            else:
                                out.write("\n")
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    strain + ".fa")) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error:no fasta information of "
                                  "{0}.fa".format(strain))
        else:
            self.helper.move_all_content(self.folders["tmp_tar"],
                                         tar_folder, [".fa"])
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to
    reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record the two temporary folders.'''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {
            "tmp_tar": os.path.join(tar_folder, "tmp"),
            "tmp_ref": os.path.join(ref_folder, "tmp"),
        }

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        '''Apply the mutation table and write the target fasta files.

        When ``output`` is given, each entry has the form
        ``filename:strain1_and_strain2``; the listed strains are
        concatenated into ``<tar_folder>/<filename>.fa``, separated by a
        newline.  When ``output`` is None the helper moves every generated
        ``.fa`` file into ``tar_folder``.  Temporary folders are removed
        afterwards.
        '''
        self.multiparser.parser_fasta(ref_folder)
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("transfer to target fasta...")
        if output is not None:
            # The generated files do not change while the outputs are
            # written, so the folder is listed once instead of per strain.
            available = set(os.listdir(self.folders["tmp_tar"]))
            for file_ in output:
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                first = True
                # The context manager closes the output even on errors.
                with open(os.path.join(tar_folder,
                                       filename + ".fa"), "w") as out:
                    for strain in strains:
                        if strain + ".fa" in available:
                            if first:
                                first = False
                            else:
                                out.write("\n")
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    strain + ".fa")) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error:no fasta information of "
                                  "{0}.fa".format(strain))
        else:
            self.helper.move_all_content(self.folders["tmp_tar"],
                                         tar_folder, [".fa"])
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
def __init__(self, args_snp):
    '''Create the helper objects and derive every path used for SNP calling.

    args_snp must provide ``types`` and ``out_folder``.  The figure
    folder below the statistics folder is created through the helper.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Results are grouped under a sub-folder chosen by the analysis type.
    file_type = ("compare_related_and_reference_genomes"
                 if args_snp.types == "related_genome"
                 else "mutations_of_reference_genomes")
    out_dir = args_snp.out_folder
    result_dir = os.path.join(out_dir, file_type)
    self.seq_path = os.path.join(result_dir, "seqs")
    self.stat_path = os.path.join(result_dir, "statistics")
    self.fig_path = os.path.join(self.stat_path, "figs")
    self.helper.check_make_folder(self.fig_path)
    self.outputs = {
        "table": os.path.join(result_dir, "SNP_tables"),
        "raw": os.path.join(result_dir, "SNP_raw_outputs"),
        "tmp": os.path.join(out_dir, "tmp_bcf"),
        "depth": os.path.join(out_dir, "tmp_depth"),
    }
    self.bams = {
        "whole": os.path.join(out_dir, "whole_reads.bam"),
        "sort": os.path.join(out_dir, "whole_reads_sorted.bam"),
        "bams": [],
    }
    self.header = os.path.join(out_dir, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
def __init__(self, args_snp):
    '''Create the helper objects and derive every path used for SNP calling.

    args_snp must provide ``types``, ``out_folder`` and ``fastas``.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # Results are grouped under a sub-folder chosen by the analysis type.
    file_type = ("compare_reference" if args_snp.types == "reference"
                 else "validate_target")
    out_dir = args_snp.out_folder
    result_dir = os.path.join(out_dir, file_type)
    self.seq_path = os.path.join(result_dir, "seqs")
    self.stat_path = os.path.join(result_dir, "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {
        "table": os.path.join(result_dir, "SNP_table"),
        "raw": os.path.join(result_dir, "SNP_raw_outputs"),
        "tmp": os.path.join(out_dir, "tmp_bcf"),
    }
    # A merged BAM left in the output folder triggers a helper clean-up
    # of the "whole_read" files before the new paths are recorded.
    if "whole_reads.bam" in os.listdir(out_dir):
        self.helper.remove_all_content(out_dir, "whole_read", "file")
    self.bams = {
        "whole": os.path.join(out_dir, "whole_reads.bam"),
        "sort": os.path.join(out_dir, "whole_reads_sorted.bam"),
    }
    self.header = os.path.join(out_dir, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
class TestSeqEditer(unittest.TestCase):
    '''Tests for SeqEditer: mutation-table import, sequence and header edits.'''

    def setUp(self):
        '''Create the working folders and the object under test.'''
        self.example = Example()
        self.test_folder = "test_folder"
        self.fasta = os.path.join(self.test_folder, "fasta")
        # makedirs creates both levels and tolerates a folder left behind
        # by an interrupted run, so the fasta sub-folder always exists.
        os.makedirs(self.fasta, exist_ok=True)
        self.seq = SeqEditer()

    def tearDown(self):
        '''Remove the working folder and everything the test wrote in it.'''
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_import_data(self):
        '''_import_data returns one record per target with its mutations.'''
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        datas = self.seq._import_data(mod_table)
        expected = [
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'c', 'ref_nt': 'a', 'position': '3'},
                    {'tar_nt': '-', 'ref_nt': 'a', 'position': '6'},
                ],
                'target_id': 'NC_test.1',
            },
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'g', 'ref_nt': '-', 'position': '6'},
                ],
                'target_id': 'test_case2',
            },
        ]
        self.assertListEqual(datas, expected)

    def test_modify_seq(self):
        '''modify_seq writes one edited fasta file per target id.'''
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        gen_file(os.path.join(self.fasta, "NC_000915.1.fa"),
                 self.example.fasta)
        self.seq.modify_seq(self.fasta, mod_table, self.test_folder)
        datas = import_data(os.path.join(self.test_folder, "NC_test.1.fa"))
        self.assertEqual("\n".join(datas), self.example.out_1)
        datas = import_data(os.path.join(self.test_folder, "test_case2.fa"))
        self.assertEqual("\n".join(datas), self.example.out_2)

    def test_modify_header(self):
        '''modify_header reduces a pipe-separated header to its 4th field.'''
        input_file = os.path.join(self.test_folder, "test.fa")
        gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT")
        self.seq.modify_header(input_file)
        datas = import_data(input_file)
        self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT")
def deal_detect(input_file, file_path, change, input_folder):
    """Normalise a fasta header and rename the file after its sequence.

    When ``change`` is true, ``input_file`` is first moved to
    ``file_path``.  The header of ``file_path`` is rewritten, the file is
    moved to ``<input_folder>/<seq_name>.fa`` and ``(change, seq_name)``
    is returned, where ``change`` is False whenever the move happened and
    ``seq_name`` is the text after ">" on the last header line.
    """
    if change:
        shutil.move(input_file, file_path)
        change = False
    SeqEditer().modify_header(file_path)
    with open(file_path) as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry.startswith(">"):
                continue
            # Later headers overwrite earlier ones, so the last one wins.
            seq_name = entry[1:]
    destination = os.path.join(input_folder, seq_name + ".fa")
    shutil.move(file_path, destination)
    return change, seq_name
def deal_detect(input_file, file_path, change, input_folder):
    '''Normalise a fasta header and rename the file after its sequence.

    When ``change`` is true, ``input_file`` is first moved to
    ``file_path``.  The header of ``file_path`` is rewritten, the file is
    moved to ``<input_folder>/<seq_name>.fa`` and ``(change, seq_name)``
    is returned, where ``change`` is False whenever the move happened and
    ``seq_name`` is the text after ">" on the last header line.
    '''
    if change:
        shutil.move(input_file, file_path)
        change = False
    SeqEditer().modify_header(file_path)
    with open(file_path) as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry.startswith(">"):
                continue
            # Later headers overwrite earlier ones, so the last one wins.
            seq_name = entry[1:]
    destination = os.path.join(input_folder, seq_name + ".fa")
    shutil.move(file_path, destination)
    return change, seq_name
class SNPCalling(object):
    '''Call SNPs per genome with samtools mpileup and bcftools call.'''

    def __init__(self, args_snp):
        '''Create the helper objects and derive every path that is used.

        args_snp must provide ``types``, ``out_folder`` and ``fastas``.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {
            "table": os.path.join(
                args_snp.out_folder, file_type, "SNP_table"),
            "raw": os.path.join(
                args_snp.out_folder, file_type, "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
        }
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _import_bam(self, bam_folder, bams):
        '''Append each ``.bam`` path of bam_folder to bams; return the count.'''
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_,
                        prefix, bam_number, table_path, args_snp):
        '''Run snp_detect on one VCF file, then let the helper move the
        ``.png`` files from table_path to the statistics folder.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_name = "_".join(
            ["stat", "_".join([prefix, self.baqs[type_]]), "SNP.csv"])
        stat_file = os.path.join(self.stat_path, stat_name)
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix,
                   type_, args_snp):
        '''Run mpileup (stdout goes to out_bcf) and bcftools call.

        Returns the VCF path ``<out_raw_prefix>_<BAQ name>.vcf``.
        bcftools is only invoked when ``args_snp.chrom`` is "1" or "2".
        '''
        # Extra mpileup flag per BAQ mode; "with" needs none.
        baq_flags = {"with": [], "without": ["-B"], "extend": ["-E"]}
        if type_ in baq_flags:
            call([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                 baq_flags[type_] +
                 ["-ugf", fasta_file, self.bams["sort"], "--ignore-RG"],
                 stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy",
                  args_snp.chrom, self.outputs["tmp"],
                  "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Run the tools for one BAQ mode and post-process the VCF.'''
        # The context manager closes the temporary file even when one of
        # the steps raises.
        with open(self.outputs["tmp"], "w") as out_bcf:
            out_vcf = self._run_tools(fasta_file, out_bcf,
                                      file_prefixs["raw_prefix"],
                                      type_, args_snp)
            self.helper.check_make_folder(
                os.path.join(self.seq_path, self.baqs[type_], prefix))
            self._transcript_snp(
                fasta_file, out_vcf,
                "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
                type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every BAQ mode listed in ``args_snp.program``.

        Exits the program on an entry other than "1", "2" or "3".
        '''
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs,
                          prefix, table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return ``(detect, prefix)`` for a fasta file name.

        ``detect`` is True and ``prefix`` is the name without extension
        for ``.fa``, ``.fna`` and ``.fasta``; otherwise ``(False, None)``.
        '''
        # prefix starts as None so that an unrecognised extension yields
        # (False, None) instead of an UnboundLocalError.
        detect = False
        prefix = None
        for extension in (".fa", ".fna", ".fasta"):
            if fasta.endswith(extension):
                prefix = fasta[:-len(extension)]
                detect = True
                break
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run ``samtools merge`` or ``samtools sort`` on the whole-read BAM.

        For "merge", bam_file is the space-separated list of input BAMs;
        for "sort", it is the output path.
        '''
        # NOTE(review): the command is built as a shell string, so paths
        # containing spaces or shell metacharacters are not handled.
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                                 self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                                 "-o", bam_file, self.bams["whole"]]))
        os.system(command)

    def _merge_bams(self, args_snp):
        '''Collect, merge (if more than one) and sort the input BAM files.

        Returns the number of BAM files found.  Exits the program when no
        BAM folder is assigned or the folders contain no BAM file.
        '''
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        if args_snp.normal_bams is not None:
            num_normal = self._import_bam(args_snp.normal_bams, bams)
        if args_snp.frag_bams is not None:
            num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        if num_bam == 0:
            # Without this check bams[0] below raised an IndexError.
            print("Error: There is no BAM file in the assigned folders!!")
            sys.exit()
        if num_bam == 1:
            shutil.copyfile(bams[0], self.bams["whole"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
        print("Sort BAM file now ...")
        self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        '''Rewrite the header of every fasta file in the folder.'''
        for fasta in os.listdir(fastas):
            if fasta.endswith(("fasta", "fa", "fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        '''Write the header of the sorted BAM to ``self.header``.'''
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        '''Return the sequence names of the ``@SQ`` lines of the BAM header.'''
        self._get_header(samtools_path)
        seq_names = []
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                # Blank lines yield an empty row and are skipped.
                if row and row[0] == "@SQ":
                    seq_names.append(row[1].split(":")[1])
        return seq_names

    def run_snp_calling(self, args_snp):
        '''Run the whole SNP calling workflow and remove temporary files.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        for fasta in os.listdir(self.fasta_path):
            # Only genomes that appear in the BAM header are processed;
            # nesting the checks keeps detect/prefix always defined.
            if fasta.split(".f")[0] not in seq_names:
                continue
            detect, prefix = self._detect_fasta(fasta)
            if not detect:
                continue
            print("Computing {0} now ...".format(fasta))
            table_path = os.path.join(self.outputs["table"], prefix)
            raw_path = os.path.join(self.outputs["raw"], prefix)
            self.helper.check_make_folder(table_path)
            self.helper.check_make_folder(raw_path)
            file_prefixs = {
                "raw_prefix": os.path.join(raw_path, prefix),
                "table_prefix": os.path.join(table_path, prefix),
            }
            fasta_file = os.path.join(self.fasta_path, fasta)
            self._run_program(fasta_file, file_prefixs, prefix,
                              bam_number, table_path, args_snp)
            os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
class Multiparser(object):
    '''Split multi-sequence fasta/gff/wig files into per-sequence files
    and merge per-sequence files back together.'''

    def __init__(self):
        '''Create the helper objects and name the scratch merge files.'''
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    @staticmethod
    def _feature_suffix(feature):
        '''Return "" for None, otherwise the feature prefixed with "_".'''
        return "" if feature is None else "_" + feature

    @staticmethod
    def _strain_names(folder_path, ref_feature):
        '''Return the sequence name encoded in each file name of a folder.'''
        names = []
        for file_ in os.listdir(folder_path):
            if ref_feature == "":
                names.append(file_[:-4])
            elif ref_feature == "_fasta":
                names.append(file_[:-3])
            else:
                names.append(file_.split(ref_feature)[0])
        return names

    @staticmethod
    def _close_all(*handles):
        '''Close every handle that was actually opened (is not None).'''
        for handle in handles:
            if handle is not None:
                handle.close()

    def get_prefix(self, folder, ref_feature):
        '''Return the original file prefix encoded in a ``*_folder`` name.

        ref_feature is the normalised suffix: "", "_fasta" or "_<feature>".
        '''
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''Merge the per-sequence fasta files of tar_folder into one file
        per ``*_folder`` entry of ref_folder.'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        ref_feature = self._feature_suffix(ref_feature)
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging fasta file of " + prefix)
            files = self._strain_names(os.path.join(ref_folder, folder),
                                       ref_feature)
            for tar in os.listdir(tar_folder):
                if tar.endswith((".fa", ".fna", ".fasta")):
                    filename = ".".join((tar.split("."))[:-1])
                    for file_ in files:
                        if filename == file_:
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder, self.tmp_fa))
                            change = True
            if change:
                change = False
                shutil.move(os.path.join(tar_folder, self.tmp_fa),
                            os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''Merge the per-sequence wig files of tar_folder into one forward
        and one reverse file per ``*_folder`` entry of ref_folder.

        A wig file is assigned to a strand through the last character
        ("+" or "-") of the entry of libs that contains its library name.
        '''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        ref_feature = self._feature_suffix(ref_feature)
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging wig file of " + prefix)
            files = self._strain_names(os.path.join(ref_folder, folder),
                                       ref_feature)
            for tar in os.listdir(tar_folder):
                filename = tar.split("_STRAIN_")
                for file_ in files:
                    if (tar.endswith(".wig")) and (
                            file_ == filename[-1][:-4]):
                        for lib in libs:
                            if (filename[0] in lib) and (lib[-1] == "+"):
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder,
                                                 self.tmp_wig_forward))
                                change_f = True
                            elif (filename[0] in lib) and (lib[-1] == "-"):
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder,
                                                 self.tmp_wig_reverse))
                                change_r = True
            # Results are only moved once both strands were merged.
            if change_f and change_r:
                change_f = False
                change_r = False
                shutil.move(os.path.join(tar_folder, self.tmp_wig_forward),
                            os.path.join(tar_merge,
                                         prefix + "_forward.wig"))
                shutil.move(os.path.join(tar_folder, self.tmp_wig_reverse),
                            os.path.join(tar_merge,
                                         prefix + "_reverse.wig"))
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''Merge the per-sequence gff files of tar_folder into one file
        per ``*_folder`` entry of ref_folder.'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        tar_feature = self._feature_suffix(tar_feature)
        ref_feature = self._feature_suffix(ref_feature)
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging gff file of " + prefix + tar_feature)
            files = self._strain_names(os.path.join(ref_folder, folder),
                                       ref_feature)
            for tar in os.listdir(tar_folder):
                for file_ in files:
                    if (".gff" in tar) and (
                            file_ + tar_feature == tar[:-4]):
                        self.helper.merge_file(
                            os.path.join(tar_folder, tar),
                            os.path.join(tar_folder, self.tmp_gff))
                        change = True
            if change:
                change = False
                shutil.move(os.path.join(tar_folder, self.tmp_gff),
                            os.path.join(tar_folder, "merge_tmp",
                                         prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''Split every fasta file of the folder into one file per sequence.

        Each sequence is written to ``<file>_folder/<name>.fa`` and to
        ``<fastas>/tmp/<name>.fa``.  For headers with more than four
        "|"-separated fields the name is the fourth field.
        '''
        par_tmp = os.path.join(fastas, "tmp")
        out = None
        out_t = None
        for fasta in os.listdir(fastas):
            if fasta.endswith(("fasta", "fa", "fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        for fasta in os.listdir(fastas):
            if ("_folder" in fasta) or ("tmp" == fasta):
                continue
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            out_path = os.path.join(fastas, fasta + "_folder")
            print("Parser " + fasta + "...")
            self.helper.check_make_folder(out_path)
            with open(os.path.join(fastas, fasta), "r") as f_f:
                for line in f_f:
                    if line[0] == ">":
                        line = line.strip()
                        if ("|" in line) and (
                                len(line.split("|")) > 4):
                            strain = line.split("|")
                            name = strain[3]
                        else:
                            name = line[1:]
                        # A new header ends the previous sequence files.
                        self._close_all(out, out_t)
                        out = open(os.path.join(
                            out_path, name + ".fa"), "w")
                        out_t = open(os.path.join(
                            par_tmp, name + ".fa"), "w")
                        out.write(">" + name + "\n")
                        out_t.write(">" + name + "\n")
                    else:
                        out.write(line)
                        out_t.write(line)
        # Nothing was opened when the folder holds no fasta file; closing
        # unconditionally used to fail on None.
        self._close_all(out, out_t)

    def parser_gff(self, gff_folder, feature):
        '''Split every gff file of the folder into one file per sequence id.

        Each group is written to ``<file>_folder/<seq_id><feature>.gff``
        and to ``<gff_folder>/tmp/<seq_id><feature>.gff``.
        '''
        par_tmp = os.path.join(gff_folder, "tmp")
        tmp_gff = os.path.join(gff_folder, "tmp.gff")
        out = None
        out_t = None
        feature = self._feature_suffix(feature)
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" in filename) or ("tmp" == filename):
                continue
            if ".gff" not in filename:
                continue
            out_path = os.path.join(gff_folder, filename + "_folder")
            print("Parser " + filename + "...")
            self.helper.check_make_folder(out_path)
            self.helper.sort_gff(os.path.join(gff_folder, filename),
                                 tmp_gff)
            with open(tmp_gff, "r") as f_h:
                for row in csv.reader(f_h, delimiter="\t"):
                    # Blank lines give an empty row; skip them together
                    # with comment lines.
                    if (not row) or row[0].startswith("#"):
                        continue
                    if pre_seq_id != row[0]:
                        # A new sequence id starts a new pair of files.
                        self._close_all(out, out_t)
                        out = open(os.path.join(
                            out_path, row[0] + feature + ".gff"), "w")
                        out_t = open(os.path.join(
                            par_tmp, row[0] + feature + ".gff"), "w")
                        pre_seq_id = row[0]
                    out.write("\t".join(row) + "\n")
                    out_t.write("\t".join(row) + "\n")
        if os.path.exists(tmp_gff):
            os.remove(tmp_gff)
        # Nothing was opened when the folder holds no gff file.
        self._close_all(out, out_t)

    def parser_wig(self, wig_folder):
        '''Split every wig file of the folder into one file per sequence.

        Each ``variableStep`` section is written, preceded by the last
        ``track`` line seen in the file, to
        ``<file>_folder/<lib>_STRAIN_<seq>.wig`` and to
        ``<wig_folder>/tmp/<lib>_STRAIN_<seq>.wig``.
        '''
        par_tmp = os.path.join(wig_folder, "tmp")
        out = None
        out_t = None
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" in filename) or ("tmp" == filename):
                continue
            if ".wig" not in filename:
                continue
            out_path = os.path.join(wig_folder, filename + "_folder")
            print("Parser {0}...".format(filename))
            self.helper.check_make_folder(out_path)
            with open(os.path.join(wig_folder, filename), "r") as w_f:
                for line in w_f:
                    line = line.split(" ")
                    if (line[0] == "track"):
                        track_info = " ".join(line)
                    if (line[0] == "variableStep"):
                        # Strip so that a chrom field at the end of the
                        # line does not put a newline into the file name.
                        strain = line[1].split("=")[1].strip()
                        self._close_all(out, out_t)
                        out = open("".join([
                            os.path.join(out_path, filename[:-4]),
                            "_STRAIN_", strain, ".wig"]), "w")
                        out_t = open("".join([
                            os.path.join(wig_folder, "tmp",
                                         filename[:-4]),
                            "_STRAIN_", strain, ".wig"]), "w")
                        if track_info != "":
                            out.write(track_info)
                            out_t.write(track_info)
                        out.write(" ".join(line))
                        out_t.write(" ".join(line))
                    if (line[0] != "track") and (
                            line[0] != "variableStep"):
                        out.write(" ".join(line))
                        out_t.write(" ".join(line))
        # Nothing was opened when the folder holds no wig file.
        self._close_all(out, out_t)
def __init__(self, tar_folder, ref_folder):
    '''Create the helper objects and record the temporary target folder.

    ref_folder is accepted but not used here.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # The temporary target folder is a "tmp" directory below tar_folder.
    tmp_tar = os.path.join(tar_folder, "tmp")
    self.folders = dict(tmp_tar=tmp_tar)
class SNPCalling(object):
    '''Call SNPs with samtools mpileup and bcftools call on merged BAMs.'''

    def __init__(self, args_snp):
        '''Create the helper objects and derive every working path.

        Reads ``types``, ``out_folder`` and ``fastas`` from ``args_snp``.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_table"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
        }
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            # A merged BAM of an earlier run is still present.
            # NOTE(review): remove_all_content presumably deletes the files
            # whose name contains "whole_read" - confirm in Helper.
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {
            "with": "with_BAQ",
            "without": "without_BAQ",
            "extend": "extend_BAQ",
        }

    def _import_bam(self, bam_folder, bams):
        '''Append every "*.bam" path of bam_folder to bams.

        Returns the number of paths that were appended.
        '''
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        '''Run snp_detect on one VCF file and move the ".png" outputs.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_file = os.path.join(
            self.stat_path,
            "_".join(["stat", "_".join([prefix, self.baqs[type_]]),
                      "SNP.csv"]))
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_,
                   args_snp):
        '''Run samtools mpileup into out_bcf, then bcftools call.

        type_ is "with", "without" or "extend" (BAQ mode).  Returns the
        path of the VCF file.  bcftools is only started when
        args_snp.chrom is "1" or "2"; for any other value the returned
        path is not created.
        '''
        # The three modes differ only in one extra mpileup flag.
        baq_flags = {"with": [], "without": ["-B"], "extend": ["-E"]}
        if type_ in baq_flags:
            call([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                 baq_flags[type_] +
                 ["-ugf", fasta_file, self.bams["sort"], "--ignore-RG"],
                 stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy",
                  args_snp.chrom, self.outputs["tmp"],
                  "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call", self.outputs["tmp"],
                  "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Call variants for one BAQ mode and post-process the VCF.'''
        # The context manager closes the temporary BCF handle even when one
        # of the external tools cannot be started.
        with open(self.outputs["tmp"], "w") as out_bcf:
            out_vcf = self._run_tools(fasta_file, out_bcf,
                                      file_prefixs["raw_prefix"],
                                      type_, args_snp)
        self.helper.check_make_folder(
            os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every BAQ mode listed in args_snp.program ("1", "2", "3").

        Terminates via sys.exit() on an unknown program index.
        '''
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return (detect, prefix) for a fasta file name.

        detect is True when the name ends with ".fa", ".fna" or ".fasta";
        prefix is the name without that extension.  For any other name the
        result is (False, None) instead of an unbound-variable error.
        '''
        for extension in (".fa", ".fna", ".fasta"):
            if fasta.endswith(extension):
                return (True, fasta[:-len(extension)])
        return (False, None)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run "samtools merge" or "samtools sort" through the shell.

        For "merge", bam_file is the space-joined list of input BAMs and
        the result is written to self.bams["whole"]; for "sort", bam_file
        is the output path and self.bams["whole"] is the input.
        '''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                                 self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                                 "-o", bam_file, self.bams["whole"]]))
        os.system(command)

    def _merge_bams(self, args_snp):
        '''Merge (or copy) all input BAM files and sort the result.

        Returns the number of BAM files found.  Terminates via sys.exit()
        when no BAM folder is given or the folders contain no ".bam" file.
        '''
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        if args_snp.normal_bams is not None:
            num_normal = self._import_bam(args_snp.normal_bams, bams)
        if args_snp.frag_bams is not None:
            num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        if num_bam == 0:
            # Previously this ended in an IndexError on bams[0].
            print("Error: There is no BAM files in the BAM folders!!")
            sys.exit()
        if num_bam == 1:
            # A single file needs no merging; it is only copied.
            shutil.copyfile(bams[0], self.bams["whole"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
        print("Sort BAM file now ...")
        self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        '''Call seq_editer.modify_header on every fasta file of fastas.'''
        for fasta in os.listdir(fastas):
            if fasta.endswith(("fasta", "fa", "fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        '''Write the header of the sorted BAM file to self.header.'''
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        '''Return the sequence names of the @SQ lines of the BAM header.'''
        self._get_header(samtools_path)
        seq_names = []
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                # Blank lines give empty rows; names may contain ":" so
                # only the first one separates the tag from the value.
                if (len(row) > 1) and (row[0] == "@SQ"):
                    seq_names.append(row[1].split(":", 1)[1])
        return seq_names

    def run_snp_calling(self, args_snp):
        '''Run the whole SNP calling workflow and remove temporary files.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        for fasta in os.listdir(self.fasta_path):
            if fasta.split(".f")[0] not in seq_names:
                continue
            detect, prefix = self._detect_fasta(fasta)
            if not detect:
                continue
            print("Computing {0} now ...".format(fasta))
            table_path = os.path.join(self.outputs["table"], prefix)
            raw_path = os.path.join(self.outputs["raw"], prefix)
            self.helper.check_make_folder(table_path)
            self.helper.check_make_folder(raw_path)
            file_prefixs = {
                "raw_prefix": os.path.join(raw_path, prefix),
                "table_prefix": os.path.join(table_path, prefix),
            }
            fasta_file = os.path.join(self.fasta_path, fasta)
            self._run_program(fasta_file, file_prefixs, prefix,
                              bam_number, table_path, args_snp)
        # The temporary BCF only exists when at least one fasta was used.
        if os.path.exists(self.outputs["tmp"]):
            os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        '''Create the helper objects and derive every working path.

        Reads ``types``, ``out_folder`` and ``fastas`` from ``args_snp``.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_table"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth"),
        }
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            # A merged BAM of an earlier run is still present.
            # NOTE(review): remove_all_content presumably deletes the files
            # whose name contains "whole_read" - confirm in Helper.
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
            "bams": [],
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {
            "with": "with_BAQ",
            "without": "without_BAQ",
            "extend": "extend_BAQ",
        }

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        '''Run snp_detect on one VCF file and move the ".png" outputs.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_prefix = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP"]))
        snp_detect(fasta, snp, self.outputs["depth"], out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_prefix, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _get_para(self, args_snp):
        '''Return (sorted BAM path, bcftools calling flags).

        The flags are "-vcO" when args_snp.caller is "c", else "-vmO".
        '''
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return self.bams["sort"], bcf_para

    def _run_tools(self, fasta_file, out_raw_prefix, type_, args_snp):
        '''Run samtools mpileup and bcftools call; return the VCF path.

        type_ is "with", "without" or "extend" (BAQ mode).  bcftools is
        only started when args_snp.chrom is "1" or "2"; for any other
        value the returned path is not created.
        '''
        bams, bcf_para = self._get_para(args_snp)
        # The three modes differ only in one extra mpileup flag.
        baq_flags = {"with": [], "without": ["-B"], "extend": ["-E"]}
        command = ([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                   baq_flags[type_])
        if not args_snp.rg:
            command.append("--ignore-RG")
        command = command + ["-ugf", fasta_file, bams]
        os.system(" ".join(command) + ">" + self.outputs["tmp"])
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy",
                  args_snp.chrom, self.outputs["tmp"],
                  bcf_para, "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call", self.outputs["tmp"],
                  bcf_para, "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Call variants for one BAQ mode and post-process the VCF.'''
        out_vcf = self._run_tools(fasta_file, file_prefixs["raw_prefix"],
                                  type_, args_snp)
        self.helper.check_make_folder(
            os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every BAQ mode listed in args_snp.program.

        Terminates via sys.exit() on an unknown program name.
        '''
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return (detect, prefix) for a fasta file name.

        detect is True when the name ends with ".fa", ".fna" or ".fasta";
        prefix is the name without that extension.  For any other name the
        result is (False, None) instead of an unbound-variable error.
        '''
        for extension in (".fa", ".fna", ".fasta"):
            if fasta.endswith(extension):
                return (True, fasta[:-len(extension)])
        return (False, None)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run "samtools merge" or "samtools sort" through the shell.

        For "merge", bam_file is the space-joined list of input BAMs and
        the result is written to self.bams["whole"]; for "sort", bam_file
        is the output path and self.bams["whole"] is the input.  The name
        derived from bam_file is recorded in self.bams["bams"].
        '''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                                 self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                                 "-o", bam_file, self.bams["whole"]]))
        os.system(command)
        self.bams["bams"].append(bam_file.replace(".bam", "_sort.bam"))

    def _merge_bams(self, args_snp):
        '''Merge (or copy), sort and index the BAMs; write the depth file.

        args_snp.bams is a list of glob patterns.  Returns the number of
        BAM files found.  Terminates via sys.exit() when args_snp.bams is
        None or the patterns match no file.
        '''
        if args_snp.bams is None:
            print("Error: There is no BAMs folders!!")
            sys.exit()
        bams = [bam for files in args_snp.bams for bam in glob(files)]
        num_bam = len(bams)
        if num_bam == 0:
            # Previously this ended in an IndexError on bams[0].
            print("Error: There is no BAM files in the BAM folders!!")
            sys.exit()
        if num_bam == 1:
            # A single file needs no merging; it is only copied.
            shutil.copyfile(bams[0], self.bams["whole"])
        else:
            print("Merging BAM files now")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
        print("Sorting BAM file now")
        self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        call([args_snp.samtools_path, "index", self.bams["sort"]])
        # The handle is closed as soon as samtools depth has finished.
        with open(self.outputs["depth"], "w") as out_depth:
            call([args_snp.samtools_path, "depth", self.bams["sort"]],
                 stdout=out_depth)
        return num_bam

    def _modify_header(self, fastas):
        '''Call seq_editer.modify_header on every fasta file of fastas.'''
        for fasta in os.listdir(fastas):
            if fasta.endswith(("fasta", "fa", "fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        '''Append the @SQ sequence names of bam's header to seq_names.

        The header is written to self.header with "samtools view -H".
        '''
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                # Blank lines give empty rows; names may contain ":" so
                # only the first one separates the tag from the value.
                if (len(row) > 1) and (row[0] == "@SQ"):
                    seq_names.append(row[1].split(":", 1)[1])

    def _get_genome_name(self, args_snp):
        '''Return the sequence names found in the sorted BAM header.'''
        seq_names = []
        self._get_header(args_snp.samtools_path, self.bams["sort"],
                         seq_names)
        return seq_names

    def _remove_bams(self):
        '''Delete the merged/sorted BAMs, their indexes, header and depth.

        Every file is only removed when it exists.
        '''
        leftovers = (
            self.bams["whole"],
            self.bams["whole"] + ".bai",
            self.bams["sort"],
            self.bams["sort"] + ".bai",
            self.header,
            self.outputs["depth"],
        )
        for path in leftovers:
            if os.path.exists(path):
                os.remove(path)

    def run_snp_calling(self, args_snp):
        '''Run the whole SNP calling workflow and remove temporary files.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        for fasta in os.listdir(self.fasta_path):
            if fasta.split(".f")[0] not in seq_names:
                continue
            detect, prefix = self._detect_fasta(fasta)
            if not detect:
                continue
            print("Computing {0} now".format(fasta))
            table_path = os.path.join(self.outputs["table"], prefix)
            raw_path = os.path.join(self.outputs["raw"], prefix)
            self.helper.check_make_folder(table_path)
            self.helper.check_make_folder(raw_path)
            file_prefixs = {
                "raw_prefix": os.path.join(raw_path, prefix),
                "table_prefix": os.path.join(table_path, prefix),
            }
            fasta_file = os.path.join(self.fasta_path, fasta)
            self._run_program(fasta_file, file_prefixs, prefix,
                              bam_number, table_path, args_snp)
        # The temporary BCF only exists when at least one fasta was used.
        if os.path.exists(self.outputs["tmp"]):
            os.remove(self.outputs["tmp"])
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams()
class Multiparser(object):
    '''Split fasta/gff/wig files per sequence and merge the pieces again.'''

    def __init__(self):
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Names of the scratch files the combine_* methods merge into.
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def _list_ref_names(self, ref_dir, ref_feature):
        '''Return the file names of ref_dir without extension/feature.

        ref_feature is "" (4-character extension is cut), "_fasta"
        (3-character extension is cut) or another "_feature" suffix (the
        name is cut at its first occurrence).
        '''
        names = []
        for file_ in os.listdir(ref_dir):
            if ref_feature == "":
                names.append(file_[:-4])
            elif ref_feature == "_fasta":
                names.append(file_[:-3])
            else:
                names.append(file_.split(ref_feature)[0])
        return names

    def _close_all(self, *handles):
        '''Close every handle that is not None.'''
        for handle in handles:
            if handle is not None:
                handle.close()

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''combine multiple fasta files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        ref_feature = "" if ref_feature is None else "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging fasta files of " + prefix)
            files = self._list_ref_names(
                os.path.join(ref_folder, folder), ref_feature)
            change = False
            for tar in os.listdir(tar_folder):
                if not tar.endswith((".fa", ".fna", ".fasta")):
                    continue
                filename = ".".join((tar.split("."))[:-1])
                for file_ in files:
                    if filename == file_:
                        self.helper.merge_file(
                            os.path.join(tar_folder, tar),
                            os.path.join(tar_folder, self.tmp_fa))
                        change = True
            if change:
                shutil.move(os.path.join(tar_folder, self.tmp_fa),
                            os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        '''Return the file prefix encoded in a "*_folder" directory name.

        ref_feature must already carry its leading "_" ("" for none).
        With "_fasta" an unknown extension leaves the name untouched
        instead of failing on an unbound variable.
        '''
        name = folder.split("_folder")[0]
        if ref_feature == "":
            return name[:-4]
        if ref_feature == "_fasta":
            for extension in (".fa", ".fna", ".fasta"):
                if name.endswith(extension):
                    return name[:-len(extension)]
            return name
        return name[:-4].split(ref_feature)[0]

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''combine multiple wig files

        Terminates via sys.exit() when a reference folder has no matching
        forward or no matching reverse wiggle file.
        '''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        ref_feature = "" if ref_feature is None else "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging wig files of " + prefix)
            files = self._list_ref_names(
                os.path.join(ref_folder, folder), ref_feature)
            change_f = False
            change_r = False
            for tar in os.listdir(tar_folder):
                filename = tar.split("_STRAIN_")
                for file_ in files:
                    if (not tar.endswith(".wig")) or (
                            file_ != filename[-1][:-4]):
                        continue
                    for lib in libs:
                        # The last character of a library entry is the
                        # strand sign.
                        if (filename[0] in lib) and (lib[-1] == "+"):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder,
                                             self.tmp_wig_forward))
                            change_f = True
                        elif (filename[0] in lib) and (lib[-1] == "-"):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder,
                                             self.tmp_wig_reverse))
                            change_r = True
            if change_f and change_r:
                shutil.move(
                    os.path.join(tar_folder, self.tmp_wig_forward),
                    os.path.join(tar_merge, prefix + "_forward.wig"))
                shutil.move(
                    os.path.join(tar_folder, self.tmp_wig_reverse),
                    os.path.join(tar_merge, prefix + "_reverse.wig"))
            else:
                print("Error: comparing input files of {0} failed. "
                      "Please check the seq IDs of all gff and fasta "
                      "files, they should be the same.\nPlease "
                      "also check the wiggle files which should contain "
                      "forward and reverse files.".format(prefix))
                sys.exit()
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''combine multiple gff files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        tar_feature = "" if tar_feature is None else "_" + tar_feature
        ref_feature = "" if ref_feature is None else "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            if "_folder" not in folder:
                continue
            prefix = self.get_prefix(folder, ref_feature)
            print("Merging gff files of " + prefix + tar_feature)
            files = self._list_ref_names(
                os.path.join(ref_folder, folder), ref_feature)
            change = False
            for tar in os.listdir(tar_folder):
                for file_ in files:
                    if (".gff" in tar) and (
                            file_ + tar_feature == tar[:-4]):
                        self.helper.merge_file(
                            os.path.join(tar_folder, tar),
                            os.path.join(tar_folder, self.tmp_gff))
                        change = True
            if change:
                shutil.move(
                    os.path.join(tar_folder, self.tmp_gff),
                    os.path.join(tar_merge, prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''parser the fasta file based on strain

        Every record of every fasta file in fastas is written twice: to
        "<file>_folder/<name>.fa" and to "tmp/<name>.fa".  Terminates via
        sys.exit() when the folder holds no fasta file.
        '''
        par_tmp = os.path.join(fastas, "tmp")
        extensions = (".fasta", ".fa", ".fna")
        detect = False
        for fasta in os.listdir(fastas):
            if fasta.endswith(extensions):
                detect = True
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        if not detect:
            print("Error: there are folders which conatin no fasta files! "
                  "The files should end with .fa or .fna or .fasta!")
            sys.exit()
        for fasta in os.listdir(fastas):
            if ("_folder" in fasta) or (fasta == "tmp") or (
                    not fasta.endswith(extensions)):
                continue
            out_path = os.path.join(fastas, fasta + "_folder")
            print("Parsing " + fasta)
            self.helper.check_make_folder(out_path)
            out = None
            out_t = None
            try:
                with open(os.path.join(fastas, fasta), "r") as f_f:
                    for line in f_f:
                        if line[0] == ">":
                            line = line.strip()
                            if ("|" in line) and (
                                    len(line.split("|")) > 4):
                                name = line.split("|")[3]
                            else:
                                name = line[1:]
                            self._close_all(out, out_t)
                            out = open(
                                os.path.join(out_path, name + ".fa"), "w")
                            out_t = open(
                                os.path.join(par_tmp, name + ".fa"), "w")
                            out.write(">" + name + "\n")
                            out_t.write(">" + name + "\n")
                        elif out is not None:
                            # Lines in front of the first header belong
                            # to no record and are skipped.
                            out.write(line)
                            out_t.write(line)
            finally:
                self._close_all(out, out_t)

    def parser_gff(self, gff_folder, feature):
        '''parser gff file based on strain

        Every gff file of gff_folder is sorted with helper.sort_gff and
        split per sequence ID into "<file>_folder/" and "tmp/".
        Terminates via sys.exit() when the folder holds no gff file.
        '''
        par_tmp = os.path.join(gff_folder, "tmp")
        sorted_gff = os.path.join(gff_folder, "tmp.gff")
        detect = False
        feature = "" if feature is None else "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            if ("_folder" in filename) or (filename == "tmp") or (
                    ".gff" not in filename):
                continue
            out_path = os.path.join(gff_folder, filename + "_folder")
            detect = True
            print("Parsing " + filename)
            self.helper.check_make_folder(out_path)
            self.helper.sort_gff(os.path.join(gff_folder, filename),
                                 sorted_gff)
            pre_seq_id = None
            out = None
            out_t = None
            try:
                with open(sorted_gff, "r") as f_h:
                    for row in csv.reader(f_h, delimiter="\t"):
                        # csv yields [] for blank lines.
                        if (not row) or row[0].startswith("#"):
                            continue
                        if pre_seq_id != row[0]:
                            self._close_all(out, out_t)
                            out = open(os.path.join(
                                out_path, row[0] + feature + ".gff"), "w")
                            out_t = open(os.path.join(
                                par_tmp, row[0] + feature + ".gff"), "w")
                            pre_seq_id = row[0]
                        out.write("\t".join(row) + "\n")
                        out_t.write("\t".join(row) + "\n")
            finally:
                self._close_all(out, out_t)
        if not detect:
            print("Error: There are folders which contain no gff3 files! "
                  "The files should end with .gff!")
            sys.exit()
        if os.path.exists(sorted_gff):
            os.remove(sorted_gff)

    def parser_wig(self, wig_folder):
        '''parser the wig file based on strain

        Every "variableStep" section of every wig file is written to
        "<file>_folder/<file>_STRAIN_<chrom>.wig" and to the same name
        under "tmp/".  Terminates via sys.exit() when the folder holds no
        wig file.
        '''
        par_tmp = os.path.join(wig_folder, "tmp")
        detect = False
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            if ("_folder" in filename) or (filename == "tmp") or (
                    ".wig" not in filename):
                continue
            out_path = os.path.join(wig_folder, filename + "_folder")
            detect = True
            print("Parsing {0}".format(filename))
            self.helper.check_make_folder(out_path)
            track_info = ""
            out = None
            out_t = None
            try:
                with open(os.path.join(wig_folder, filename), "r") as w_f:
                    for line in w_f:
                        line = line.split(" ")
                        if line[0] == "track":
                            track_info = " ".join(line)
                        elif line[0] == "variableStep":
                            # Without a "span=" field the chrom value is
                            # the last token and still ends with the line
                            # break; it must not reach the file name.
                            strain = line[1].split("=")[1].strip()
                            self._close_all(out, out_t)
                            wig_name = "".join([
                                filename[:-4], "_STRAIN_", strain, ".wig"])
                            out = open(
                                os.path.join(out_path, wig_name), "w")
                            out_t = open(
                                os.path.join(par_tmp, wig_name), "w")
                            if track_info != "":
                                out.write(track_info)
                                out_t.write(track_info)
                            out.write(" ".join(line))
                            out_t.write(" ".join(line))
                        elif out is not None:
                            # Data in front of the first declaration line
                            # belongs to no section and is skipped.
                            out.write(" ".join(line))
                            out_t.write(" ".join(line))
            finally:
                self._close_all(out, out_t)
        if not detect:
            print("Error: There are folders which contain no wig files! "
                  "The files should end with .wig!")
            sys.exit()
class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and remember the temporary folder.

        ref_folder is accepted for interface compatibility; it is not
        read here.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Prepare the working folders and return the reference folder.

        The reference files are copied to "<out_folder>/tmp_reference"
        and handed to multiparser.parser_fasta; "<out_folder>/fasta_files"
        and the temporary target folder are recreated empty.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        for folder in (os.path.join(out_folder, "fasta_files"),
                       self.folders["tmp_tar"]):
            if os.path.exists(folder):
                shutil.rmtree(folder)
            os.mkdir(folder)
        return new_ref_folder

    @staticmethod
    def _merge_fastas(fasta_dir, out_seq, new_header=None):
        '''Concatenate all ".fa" files of fasta_dir into out_seq.

        Every merged file is deleted afterwards.  When new_header is
        given, each header line is replaced by ">" + new_header;
        otherwise the files are appended unchanged.
        '''
        # The listing is taken before out_seq is opened so that the output
        # can never be picked up as one of its own inputs.
        seq_paths = [os.path.join(fasta_dir, seq)
                     for seq in os.listdir(fasta_dir)
                     if seq.endswith(".fa")]
        if new_header is None:
            with open(out_seq, "ab") as merged:
                for seq_path in seq_paths:
                    with open(seq_path, "rb") as t_h:
                        shutil.copyfileobj(t_h, merged)
                    os.remove(seq_path)
        else:
            with open(out_seq, "w") as merged:
                for seq_path in seq_paths:
                    with open(seq_path) as t_h:
                        for line in t_h:
                            if line.startswith(">"):
                                merged.write(">" + new_header + "\n")
                            else:
                                merged.write(line)
                    os.remove(seq_path)

    def get_target_fasta(self, mut_table, tar_folder, ref_files,
                         out_name, out_folder, log):
        '''Write "<out_folder>/fasta_files/<out_name>.fa".

        mut_table -- tab-separated table; rows starting with "#" are
                     skipped and the first column is the sequence name.
        tar_folder -- accepted for interface compatibility; not read here.
        ref_files -- reference fasta files (see gen_folder).
        log -- object with a write() method for progress messages.

        A table without data rows results in an empty output file.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        fasta_dir = os.path.join(out_folder, "fasta_files")
        # The temporary target folder is only read below, so one listing
        # is enough.
        updated = set(os.listdir(self.folders["tmp_tar"]))
        pre_strain = None
        strain_num = 0
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                # csv yields [] for blank lines.
                if (not row) or row[0].startswith("#"):
                    continue
                if pre_strain != row[0]:
                    strain_num = strain_num + 1
                    tmp_tar_name = "_".join([out_name, row[0]]) + ".fa"
                    with open(os.path.join(fasta_dir, tmp_tar_name),
                              "w") as out:
                        if tmp_tar_name in updated:
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    tmp_tar_name)) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error: No updated information of "
                                  "{0}.fa".format(row[0]))
                pre_strain = row[0]
        out_seq = out_name + ".fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        if strain_num == 1:
            # A single sequence is renamed to out_name.
            self._merge_fastas(fasta_dir, out_seq, out_name)
        else:
            self._merge_fastas(fasta_dir, out_seq)
        shutil.move(out_seq, os.path.join(fasta_dir, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(out_folder, "fasta_files", out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")
class SNPCalling(object):
    '''Detection of SNPs/mutations with Samtools and Bcftools.

    The BAM files of every sample are merged and sorted, ``samtools
    mpileup`` and ``bcftools call`` are run for each requested BAQ
    program, the resulting VCF files are split per fasta file, and
    ``snp_detect`` is called to filter and summarise them.
    '''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(
                args_snp.out_folder, file_type, "SNP_tables"),
            "raw": os.path.join(
                args_snp.out_folder, file_type, "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
            "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        # Internal BAQ type -> name used in folder and file names.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, out_table_prefix, type_,
                        prefix, bam_datas, table_path, args_snp):
        '''Run ``snp_detect`` for every sample of one fasta prefix and
        move the generated ".png" files to the figure folder.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(self.stat_path, "_".join([
                "stat", "_".join([prefix, self.baqs[type_], bam["sample"]]),
                "SNP"]))
            snp_file = os.path.join(self.outputs["raw"], prefix, "_".join(
                [prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        '''Return the option string for ``bcftools call`` ("c" selects
        "-vcO", every other caller "-vmO").'''
        return "-vcO" if args_snp.caller == "c" else "-vmO"

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas, log):
        '''Run ``samtools mpileup`` and ``bcftools call`` for each sample.

        The path of the VCF file of each sample is stored in
        ``bam["vcf"]``.  ``type_`` must be "with", "without" or "extend";
        any other value raises ``KeyError``.
        '''
        bcf_para = self._get_para(args_snp)
        # Extra mpileup flag of every BAQ type.
        baq_options = {"with": [], "without": ["-B"], "extend": ["-E"]}
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            command = ([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                       baq_options[type_])
            if not args_snp.rg:
                command.append("--ignore-RG")
            command += ["-ugf", fasta_file, bam_file]
            log.write(" ".join(command) + ">" + self.outputs["tmp"] + "\n")
            # Redirect stdout directly instead of going through a shell.
            with open(self.outputs["tmp"], "wb") as pileup:
                call(command, stdout=pileup)
            bam["vcf"] = os.path.join(self.outputs["raw"], "_".join(
                [self.baqs[type_], bam["sample"] + ".vcf"]))
            # Only ploidy "1" and "2" are handled; other values run nothing.
            if args_snp.chrom in ("1", "2"):
                bcf_command = [args_snp.bcftools_path, "call"]
                if args_snp.chrom == "1":
                    bcf_command += ["--ploidy", args_snp.chrom]
                bcf_command += [self.outputs["tmp"], bcf_para, "v",
                                "-o", bam["vcf"]]
                log.write(" ".join(bcf_command) + "\n")
                call(bcf_command)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(self.outputs["raw"]):
            log.write("\t" + os.path.join(self.outputs["raw"], file_) + "\n")

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog, log):
        '''Split the raw VCF files by fasta file.

        For every fasta file (except "all.fa" and ".fai" files) a folder
        named after the file prefix is used, and every VCF file in the
        raw output folder is copied into it, keeping header lines and
        records whose first column is a known sequence name.  The
        unsplit VCF files are removed afterwards.

        Returns the list of fasta prefixes.
        '''
        # NOTE(review): the names are accumulated over all fasta files, so
        # a later fasta file also keeps the records of the earlier ones.
        # This is the existing behaviour - confirm it is intended.
        seq_names = set()
        fa_prefixs = []
        log.write("Parsing Vcf files by comparing fasta information.\n")
        for fa in os.listdir(args_snp.fastas):
            if (fa == "all.fa") or fa.endswith(".fai"):
                continue
            with open(os.path.join(args_snp.fastas, fa)) as fh:
                for line in fh:
                    line = line.strip()
                    if line.startswith(">"):
                        seq_names.add(line[1:])
            fa_prefix = ".".join(fa.split(".")[:-1])
            fa_prefixs.append(fa_prefix)
            vcf_folder = os.path.join(self.outputs["raw"], fa_prefix)
            # The VCF and table folders are shared by all BAQ programs and
            # are therefore only created for the first one.
            if num_prog == 0:
                self.helper.check_make_folder(vcf_folder)
                self.helper.check_make_folder(os.path.join(
                    self.outputs["table"], fa_prefix))
            self.helper.check_make_folder(
                os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
            for vcf in os.listdir(self.outputs["raw"]):
                if not vcf.endswith(".vcf"):
                    continue
                out_vcf = os.path.join(vcf_folder,
                                       "_".join([fa_prefix, vcf]))
                with open(out_vcf, "w") as out, \
                        open(os.path.join(self.outputs["raw"], vcf)) as vh:
                    for line in vh:
                        line = line.strip()
                        if line.startswith("#") or (
                                line.split("\t")[0] in seq_names):
                            out.write(line + "\n")
                log.write("\t" + out_vcf + " is generated.\n")
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog, log):
        '''Run the whole analysis of one BAQ program.'''
        self._run_tools(all_fasta, type_, args_snp, bam_datas, log)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog, log)
        log.write("Running transcript_SNP.py to do statistics, filter SNPs, "
                  "and generate potential sequences.\n")
        log.write("The following files are generated:\n")
        for fa_prefix in fa_prefixs:
            # NOTE(review): this is a substring match and the last matching
            # file wins - verify that prefixes cannot be ambiguous.
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(table_path, "_".join(
                [fa_prefix, self.baqs[type_]]))
            self._transcript_snp(
                fasta_file, table_prefix,
                type_, fa_prefix, bam_datas, table_path, args_snp)
            seq_path = os.path.join(self.seq_path, self.baqs[type_],
                                    fa_prefix)
            for folder in (table_path, self.stat_path,
                           seq_path, self.fig_path):
                for file_ in os.listdir(folder):
                    if os.path.isfile(os.path.join(folder, file_)):
                        log.write("\t" + os.path.join(folder, file_) + "\n")

    def _run_program(self, all_fasta, bam_datas, args_snp, log):
        '''Run every program listed in ``args_snp.program``; exit on an
        unknown program name.'''
        program_types = {"with_BAQ": "with", "without_BAQ": "without",
                         "extend_BAQ": "extend"}
        log.write("Running Samtools to mpileup, and using Bcftools to "
                  "call snp.\n")
        log.write("Please make sure the version of Samtools and Bcftools "
                  "are both at least 1.3.1.\n")
        for num_prog, index in enumerate(args_snp.program):
            if index not in program_types:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                log.write("No valid program can be found, please assign "
                          "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".\n")
                sys.exit()
            message = "Running SNP calling " + index.replace("_", " ")
            print(message)
            log.write(message + ".\n")
            self._run_sub(args_snp, all_fasta, program_types[index],
                          bam_datas, num_prog, log)

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file, log):
        '''Run ``samtools merge`` or ``samtools sort``.

        For "merge", ``bam_file`` is the space-separated list of input
        files and the result is the temporary whole-reads BAM file.  For
        "sort", ``bam_file`` is the output file and ``type_file`` is the
        input file, or "all" for the whole-reads BAM file.
        '''
        if sub_command == "merge":
            command = " ".join([samtools_path, sub_command,
                                self.bams["whole"], bam_file])
        elif sub_command == "sort":
            source = self.bams["whole"] if type_file == "all" else type_file
            command = " ".join([samtools_path, sub_command,
                                "-o", bam_file, source])
        else:
            raise ValueError(
                "Unsupported samtools sub-command: " + str(sub_command))
        log.write(command + "\n")
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas, log):
        '''Merge/sort the BAM files of every sample, index the result and
        write its read depth to ``<depth prefix><sample>``.'''
        log.write("Using Samtools to merge and sort BAM files.\n")
        log.write("Please make sure the version of Samtools is at least "
                  "1.3.1.\n")
        for bam in bam_datas:
            out_bam = os.path.join(args_snp.out_folder,
                                   bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort",
                              out_bam, bam["bams"][0], log)
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all", log)
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort",
                              out_bam, "all", log)
            # One sorted BAM file is produced per sample in both cases.
            bam["bam_number"] = 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            log.write(" ".join([args_snp.samtools_path, "index", out_bam]) +
                      "\n")
            call([args_snp.samtools_path, "index", out_bam])
            log.write(" ".join([args_snp.samtools_path, "depth", out_bam]) +
                      "\n")
            with open(self.outputs["depth"] + bam["sample"],
                      "w") as out_depth:
                call([args_snp.samtools_path, "depth", out_bam],
                     stdout=out_depth)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        log.write("\t" + self.bams["whole"] + " is temporary generated "
                  "(be deleted afterward).\n")
        for file_ in os.listdir(args_snp.out_folder):
            if os.path.isfile(os.path.join(args_snp.out_folder, file_)):
                log.write("\t" + os.path.join(args_snp.out_folder, file_) +
                          "\n")

    def _modify_header(self, fastas):
        '''Let ``SeqEditer.modify_header`` process every fasta file
        (".fa", ".fasta", ".fna") of the folder.'''
        for fasta in os.listdir(fastas):
            # Same extensions as in _merge_fasta.
            if fasta.endswith((".fasta", ".fa", ".fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        '''Append the sequence names of the "@SQ" header lines of ``bam``
        to ``seq_names`` (without duplicates).'''
        with open(self.header, "w") as out:
            call([samtools_path, "view", "-H", bam], stdout=out)
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if row and (row[0] == "@SQ"):
                    # Split only once so that names containing ":" are
                    # kept complete.
                    name = row[1].split(":", 1)[1]
                    if name not in seq_names:
                        seq_names.append(name)

    def _get_genome_name(self, args_snp, bam_datas):
        '''Return the sequence names found in the BAM headers of all
        samples.'''
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path, bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        '''Remove the temporary BAM, index, header and depth files.'''
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            for tmp_file in (bam_file, bam_file + ".bai", self.header,
                             self.outputs["depth"] + bam["sample"]):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

    def _extract_bams(self, bams, log):
        '''Parse the "sample:file1,file2,..." strings of --bam_files.

        Returns a list of dictionaries with the keys "sample", "rep"
        (number of files) and "bams" (list of files).  Exits if the
        format is wrong or a file does not exist.
        '''
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                log.write("the format of --bam_files is wrong!\n")
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    log.write(file_ + " is not found.\n")
                    sys.exit()
            bam_datas.append({"sample": datas[0], "rep": len(files),
                              "bams": files})
        return bam_datas

    def _merge_fasta(self, fastas, log):
        '''Merge all fasta files of the folder into "all.fa".

        A sequence whose header has already been seen is skipped.
        Returns the path of "all.fa".
        '''
        all_fasta = os.path.join(fastas, "all.fa")
        names = set()
        print_ = False
        with open(all_fasta, "w") as out:
            for fasta in os.listdir(fastas):
                # The output file is located in the same folder; reading
                # it while it is being written could re-import a
                # partially flushed header line.
                if fasta == "all.fa":
                    continue
                if fasta.endswith((".fa", ".fasta", ".fna")):
                    with open(os.path.join(fastas, fasta)) as fh:
                        for line in fh:
                            line = line.strip()
                            if line.startswith(">"):
                                print_ = line not in names
                                names.add(line)
                            if print_:
                                out.write(line + "\n")
                    log.write(os.path.join(fastas, fasta) + " is loaded.\n")
        return all_fasta

    def run_snp_calling(self, args_snp, log):
        '''Entry point: run the complete SNP calling.'''
        # Check the programs first, before any file is modified and before
        # the time-consuming merging of the BAM files.
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas, log)
        bam_datas = self._extract_bams(args_snp.bams, log)
        self._merge_bams(args_snp, bam_datas, log)
        print("Detecting mutations now")
        self._run_program(all_fasta, bam_datas, args_snp, log)
        os.remove(self.outputs["tmp"])
        os.remove(all_fasta)
        os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
        log.write("Remove all the temporary files.\n")
class SNPCalling(object):
    '''Detection of SNPs/mutations with Samtools and Bcftools.

    The BAM files of every sample are merged and sorted, ``samtools
    mpileup`` and ``bcftools call`` are run for each requested BAQ
    program, the resulting VCF files are split per fasta file, and
    ``snp_detect`` is called to filter and summarise them.
    '''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_tables"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")
        }
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
            "bams": []
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        # Internal BAQ type -> name used in folder and file names.
        self.baqs = {
            "with": "with_BAQ",
            "without": "without_BAQ",
            "extend": "extend_BAQ"
        }

    def _transcript_snp(self, fasta, out_table_prefix, type_, prefix,
                        bam_datas, table_path, args_snp):
        '''Run ``snp_detect`` for every sample of one fasta prefix and
        move the generated ".png" files to the figure folder.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(
                self.stat_path, "_".join([
                    "stat",
                    "_".join([prefix, self.baqs[type_], bam["sample"]]),
                    "SNP"
                ]))
            snp_file = os.path.join(
                self.outputs["raw"], prefix,
                "_".join([prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        '''Return the option string for ``bcftools call`` ("c" selects
        "-vcO", every other caller "-vmO").'''
        return "-vcO" if args_snp.caller == "c" else "-vmO"

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas):
        '''Run ``samtools mpileup`` and ``bcftools call`` for each sample.

        The path of the VCF file of each sample is stored in
        ``bam["vcf"]``.  ``type_`` must be "with", "without" or "extend";
        any other value raises ``KeyError``.
        '''
        bcf_para = self._get_para(args_snp)
        # Extra mpileup flag of every BAQ type.
        baq_options = {"with": [], "without": ["-B"], "extend": ["-E"]}
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            command = ([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                       baq_options[type_])
            if not args_snp.rg:
                command.append("--ignore-RG")
            command += ["-ugf", fasta_file, bam_file]
            # Redirect stdout directly instead of going through a shell.
            with open(self.outputs["tmp"], "wb") as pileup:
                call(command, stdout=pileup)
            bam["vcf"] = os.path.join(
                self.outputs["raw"],
                "_".join([self.baqs[type_], bam["sample"] + ".vcf"]))
            # Only ploidy "1" and "2" are handled; other values run nothing.
            if args_snp.chrom in ("1", "2"):
                bcf_command = [args_snp.bcftools_path, "call"]
                if args_snp.chrom == "1":
                    bcf_command += ["--ploidy", args_snp.chrom]
                bcf_command += [self.outputs["tmp"], bcf_para, "v",
                                "-o", bam["vcf"]]
                call(bcf_command)

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog):
        '''Split the raw VCF files by fasta file.

        For every fasta file (except "all.fa" and ".fai" files) a folder
        named after the file prefix is used, and every VCF file in the
        raw output folder is copied into it, keeping header lines and
        records whose first column is a known sequence name.  The
        unsplit VCF files are removed afterwards.

        Returns the list of fasta prefixes.
        '''
        # NOTE(review): the names are accumulated over all fasta files, so
        # a later fasta file also keeps the records of the earlier ones.
        # This is the existing behaviour - confirm it is intended.
        seq_names = set()
        fa_prefixs = []
        for fa in os.listdir(args_snp.fastas):
            if (fa == "all.fa") or fa.endswith(".fai"):
                continue
            with open(os.path.join(args_snp.fastas, fa)) as fh:
                for line in fh:
                    line = line.strip()
                    if line.startswith(">"):
                        seq_names.add(line[1:])
            fa_prefix = ".".join(fa.split(".")[:-1])
            fa_prefixs.append(fa_prefix)
            vcf_folder = os.path.join(self.outputs["raw"], fa_prefix)
            # The VCF and table folders are shared by all BAQ programs and
            # are therefore only created for the first one.
            if num_prog == 0:
                self.helper.check_make_folder(vcf_folder)
                self.helper.check_make_folder(
                    os.path.join(self.outputs["table"], fa_prefix))
            self.helper.check_make_folder(
                os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
            for vcf in os.listdir(self.outputs["raw"]):
                if not vcf.endswith(".vcf"):
                    continue
                out_vcf = os.path.join(vcf_folder,
                                       "_".join([fa_prefix, vcf]))
                with open(out_vcf, "w") as out, \
                        open(os.path.join(self.outputs["raw"], vcf)) as vh:
                    for line in vh:
                        line = line.strip()
                        if line.startswith("#") or (
                                line.split("\t")[0] in seq_names):
                            out.write(line + "\n")
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog):
        '''Run the whole analysis of one BAQ program.'''
        self._run_tools(all_fasta, type_, args_snp, bam_datas)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog)
        for fa_prefix in fa_prefixs:
            # NOTE(review): this is a substring match and the last matching
            # file wins - verify that prefixes cannot be ambiguous.
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(
                table_path, "_".join([fa_prefix, self.baqs[type_]]))
            self._transcript_snp(fasta_file, table_prefix, type_, fa_prefix,
                                 bam_datas, table_path, args_snp)

    def _run_program(self, all_fasta, bam_datas, args_snp):
        '''Run every program listed in ``args_snp.program``; exit on an
        unknown program name.'''
        program_types = {"with_BAQ": "with", "without_BAQ": "without",
                         "extend_BAQ": "extend"}
        for num_prog, index in enumerate(args_snp.program):
            if index not in program_types:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            print("Running SNP calling " + index.replace("_", " "))
            self._run_sub(args_snp, all_fasta, program_types[index],
                          bam_datas, num_prog)

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file):
        '''Run ``samtools merge`` or ``samtools sort``.

        For "merge", ``bam_file`` is the space-separated list of input
        files and the result is the temporary whole-reads BAM file.  For
        "sort", ``bam_file`` is the output file and ``type_file`` is the
        input file, or "all" for the whole-reads BAM file.
        '''
        if sub_command == "merge":
            command = " ".join(
                [samtools_path, sub_command, self.bams["whole"], bam_file])
        elif sub_command == "sort":
            source = self.bams["whole"] if type_file == "all" else type_file
            command = " ".join(
                [samtools_path, sub_command, "-o", bam_file, source])
        else:
            raise ValueError(
                "Unsupported samtools sub-command: " + str(sub_command))
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas):
        '''Merge/sort the BAM files of every sample, index the result and
        write its read depth to ``<depth prefix><sample>``.'''
        for bam in bam_datas:
            out_bam = os.path.join(args_snp.out_folder,
                                   bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam,
                              bam["bams"][0])
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all")
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam, "all")
            # One sorted BAM file is produced per sample in both cases.
            bam["bam_number"] = 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            call([args_snp.samtools_path, "index", out_bam])
            with open(self.outputs["depth"] + bam["sample"],
                      "w") as out_depth:
                call([args_snp.samtools_path, "depth", out_bam],
                     stdout=out_depth)

    def _modify_header(self, fastas):
        '''Let ``SeqEditer.modify_header`` process every fasta file
        (".fa", ".fasta", ".fna") of the folder.'''
        for fasta in os.listdir(fastas):
            # Same extensions as in _merge_fasta.
            if fasta.endswith((".fasta", ".fa", ".fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        '''Append the sequence names of the "@SQ" header lines of ``bam``
        to ``seq_names`` (without duplicates).'''
        with open(self.header, "w") as out:
            call([samtools_path, "view", "-H", bam], stdout=out)
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if row and (row[0] == "@SQ"):
                    # Split only once so that names containing ":" are
                    # kept complete.
                    name = row[1].split(":", 1)[1]
                    if name not in seq_names:
                        seq_names.append(name)

    def _get_genome_name(self, args_snp, bam_datas):
        '''Return the sequence names found in the BAM headers of all
        samples.'''
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path, bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        '''Remove the temporary BAM, index, header and depth files.'''
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            for tmp_file in (bam_file, bam_file + ".bai", self.header,
                             self.outputs["depth"] + bam["sample"]):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

    def _extract_bams(self, bams):
        '''Parse the "sample:file1,file2,..." strings of --bam_files.

        Returns a list of dictionaries with the keys "sample", "rep"
        (number of files) and "bams" (list of files).  Exits if the
        format is wrong or a file does not exist.
        '''
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    sys.exit()
            bam_datas.append({
                "sample": datas[0],
                "rep": len(files),
                "bams": files
            })
        return bam_datas

    def _merge_fasta(self, fastas):
        '''Merge all fasta files of the folder into "all.fa".

        A sequence whose header has already been seen is skipped.
        Returns the path of "all.fa".
        '''
        all_fasta = os.path.join(fastas, "all.fa")
        names = set()
        print_ = False
        with open(all_fasta, "w") as out:
            for fasta in os.listdir(fastas):
                # The output file is located in the same folder; reading
                # it while it is being written could re-import a
                # partially flushed header line.
                if fasta == "all.fa":
                    continue
                if fasta.endswith((".fa", ".fasta", ".fna")):
                    with open(os.path.join(fastas, fasta)) as fh:
                        for line in fh:
                            line = line.strip()
                            if line.startswith(">"):
                                print_ = line not in names
                                names.add(line)
                            if print_:
                                out.write(line + "\n")
        return all_fasta

    def run_snp_calling(self, args_snp):
        '''Entry point: run the complete SNP calling.'''
        # Check the programs first, before any file is modified and before
        # the time-consuming merging of the BAM files.
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas)
        bam_datas = self._extract_bams(args_snp.bams)
        self._merge_bams(args_snp, bam_datas)
        print("Detecting mutations now")
        self._run_program(all_fasta, bam_datas, args_snp)
        os.remove(self.outputs["tmp"])
        os.remove(all_fasta)
        os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
class TargetFasta(object):
    '''Generate updated genome fasta files from a mutation table.

    The reference fasta files are copied to a temporary folder, edited by
    ``SeqEditer.modify_seq`` according to the mutation table, and the
    updated sequence of every strain is written to
    ``<out_folder>/fasta_files``.
    '''

    def __init__(self, tar_folder, ref_folder):
        # ref_folder is not used here; it is kept so that existing callers
        # do not need to change.
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Prepare the temporary folders.

        Copies ``ref_files`` into ``<out_folder>/tmp_reference``, lets the
        multiparser process that folder and (re)creates the temporary
        target folder.

        Returns the path of the temporary reference folder.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, combine,
                         out_folder):
        '''Write the updated fasta files to ``<out_folder>/fasta_files``.

        mut_table  -- tab-separated mutation table; lines starting with
                      "#" are ignored and the second column is the strain.
        tar_folder -- unused here (the folder given to ``__init__`` is
                      used); kept for interface compatibility.
        ref_files  -- reference fasta files to be updated.
        combine    -- if true, all fasta files of the output folder are
                      merged into "updated_genomes.fa".
        out_folder -- output folder; "fasta_files" must already exist.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Updating the reference sequences")
        fasta_folder = os.path.join(out_folder, "fasta_files")
        # The temporary folder is not modified while the table is read, so
        # one directory listing is enough.
        updated = set(os.listdir(self.folders["tmp_tar"]))
        pre_strain = None
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                # Skip blank and comment lines before reading the strain
                # column; a comment line may consist of a single column.
                if (not row) or row[0].startswith("#"):
                    continue
                strain = row[1]
                if pre_strain != strain:
                    with open(os.path.join(fasta_folder, strain + ".fa"),
                              "w") as out:
                        if strain + ".fa" in updated:
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    strain + ".fa")) as f_h:
                                shutil.copyfileobj(f_h, out)
                        else:
                            print("Error: No fasta information of "
                                  "{0}.fa".format(strain))
                # Remember the strain so that consecutive rows of the same
                # strain do not rewrite the same file again.
                pre_strain = strain
        if pre_strain is None:
            print("Warning: No mutation record is found in " + mut_table)
        if combine:
            # The combined file is built in the working directory first so
            # that it is not picked up while fasta_files is being scanned.
            out_seq = "updated_genomes.fa"
            if os.path.exists(out_seq):
                os.remove(out_seq)
            # Plain byte-wise concatenation (what "cat a >> b" did), done
            # in Python so that no shell and no path quoting is involved.
            with open(out_seq, "wb") as o_s:
                for seq in os.listdir(fasta_folder):
                    if not seq.endswith(".fa"):
                        continue
                    seq_file = os.path.join(fasta_folder, seq)
                    with open(seq_file, "rb") as t_h:
                        shutil.copyfileobj(t_h, o_s)
                    os.remove(seq_file)
            shutil.move(out_seq, os.path.join(fasta_folder, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta files to remapping again.")
class TargetFasta(object):
    '''Generate updated genome fasta files from a mutation table.

    The reference fasta files are copied to a temporary folder, edited by
    ``SeqEditer.modify_seq`` according to the mutation table, and the
    resulting per-strain sequences are combined into one fasta file in
    ``<out_folder>/fasta_files``.
    '''

    def __init__(self, tar_folder, ref_folder):
        # ref_folder is not used here; it is kept so that existing callers
        # do not need to change.
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Prepare the temporary and output folders.

        Copies ``ref_files`` into ``<out_folder>/tmp_reference``, lets the
        multiparser process that folder, and (re)creates empty
        ``<out_folder>/fasta_files`` and temporary target folders.

        Returns the path of the temporary reference folder.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Always start from an empty output folder; create it when missing
        # so that writing the per-strain files cannot fail on a fresh run.
        fasta_folder = os.path.join(out_folder, "fasta_files")
        if os.path.exists(fasta_folder):
            shutil.rmtree(fasta_folder)
        os.mkdir(fasta_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def _collect_strain_fastas(self, mut_table, out_name, fasta_folder):
        '''Copy the updated fasta of every strain listed in the table.

        The first column of each non-comment row is the strain name.
        Returns the number of strain blocks found in the table.
        '''
        strain_num = 0
        pre_strain = None
        # The temporary folder is not modified while the table is read, so
        # one directory listing is enough.
        updated = set(os.listdir(self.folders["tmp_tar"]))
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                # Skip blank lines and comment lines.
                if (not row) or row[0].startswith("#"):
                    continue
                if pre_strain != row[0]:
                    strain_num += 1
                    tmp_tar_name = "_".join([out_name, row[0]]) + ".fa"
                    fasta = os.path.join(fasta_folder, tmp_tar_name)
                    with open(fasta, "w") as out:
                        if tmp_tar_name in updated:
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    tmp_tar_name)) as f_h:
                                shutil.copyfileobj(f_h, out)
                        else:
                            print("Error: No updated information of "
                                  "{0}.fa".format(row[0]))
                pre_strain = row[0]
        return strain_num

    def _combine_fastas(self, fasta_folder, out_seq, out_name, strain_num):
        '''Merge all per-strain fasta files into ``out_seq`` and delete them.

        With exactly one strain the header is replaced by ``out_name``;
        otherwise the files are concatenated unchanged.
        '''
        if strain_num == 1:
            with open(out_seq, "w") as o_s:
                for seq in os.listdir(fasta_folder):
                    if not seq.endswith(".fa"):
                        continue
                    seq_file = os.path.join(fasta_folder, seq)
                    with open(seq_file) as t_h:
                        for line in t_h:
                            if line.startswith(">"):
                                o_s.write(">" + out_name + "\n")
                            else:
                                o_s.write(line)
                    os.remove(seq_file)
        else:
            # Plain byte-wise concatenation (what "cat a >> b" did), done
            # in Python so that no shell and no path quoting is involved.
            with open(out_seq, "wb") as o_s:
                for seq in os.listdir(fasta_folder):
                    if not seq.endswith(".fa"):
                        continue
                    seq_file = os.path.join(fasta_folder, seq)
                    with open(seq_file, "rb") as t_h:
                        shutil.copyfileobj(t_h, o_s)
                    os.remove(seq_file)

    def get_target_fasta(self, mut_table, tar_folder, ref_files, out_name,
                         out_folder, log):
        '''Create ``<out_folder>/fasta_files/<out_name>.fa``.

        mut_table  -- tab-separated mutation table; lines starting with
                      "#" are ignored and the first column is the strain.
        tar_folder -- unused here (the folder given to ``__init__`` is
                      used); kept for interface compatibility.
        ref_files  -- reference fasta files to be updated.
        out_name   -- name of the output file (without ".fa").
        out_folder -- output folder.
        log        -- writable log handle.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        fasta_folder = os.path.join(out_folder, "fasta_files")
        strain_num = self._collect_strain_fastas(
            mut_table, out_name, fasta_folder)
        if strain_num == 0:
            print("Warning: No mutation record is found in " + mut_table)
        # The combined file is built in the working directory first so
        # that it is not picked up while fasta_files is being scanned.
        out_seq = out_name + ".fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        self._combine_fastas(fasta_folder, out_seq, out_name, strain_num)
        shutil.move(out_seq, os.path.join(fasta_folder, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(out_folder, "fasta_files", out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")