Esempio n. 1
0
 def __init__(self, args_snp):
     """Create the helper objects and derive the paths used by SNP calling.

     ``args_snp`` is read for its ``types``, ``out_folder`` and ``fastas``
     attributes.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Results are grouped in a sub-folder selected by the comparison type.
     file_type = ("compare_reference" if args_snp.types == "reference"
                  else "validate_target")
     out_dir = args_snp.out_folder
     result_dir = os.path.join(out_dir, file_type)
     self.seq_path = os.path.join(result_dir, "seqs")
     self.stat_path = os.path.join(result_dir, "statistics")
     self.fasta_path = os.path.join(args_snp.fastas, "tmp")
     self.outputs = {
         "table": os.path.join(result_dir, "SNP_table"),
         "raw": os.path.join(result_dir, "SNP_raw_outputs"),
         "tmp": os.path.join(out_dir, "tmp_bcf"),
         "depth": os.path.join(out_dir, "tmp_depth"),
     }
     # When a "whole_reads.bam" is already present, hand the output folder
     # to the helper with the "whole_read" / "file" arguments.
     if "whole_reads.bam" in os.listdir(out_dir):
         self.helper.remove_all_content(out_dir, "whole_read", "file")
     self.bams = dict(
         whole=os.path.join(out_dir, "whole_reads.bam"),
         sort=os.path.join(out_dir, "whole_reads_sorted.bam"),
         bams=[])
     self.header = os.path.join(out_dir, "header")
     self.baqs = {
         "with": "with_BAQ",
         "without": "without_BAQ",
         "extend": "extend_BAQ",
     }
Esempio n. 2
0
 def __init__(self):
     """Create the helper objects and name the temporary working files."""
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Names of the temporary fasta, gff and forward/reverse wig files.
     self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
     self.tmp_wig_forward, self.tmp_wig_reverse = (
         "tmp_forward.wig", "tmp_reverse.wig")
Esempio n. 3
0
 def __init__(self, args_snp):
     """Create helper objects and derive the output layout for SNP calling.

     ``args_snp`` is read for its ``types`` and ``out_folder`` attributes.
     The figure folder below the statistics path is passed to the helper's
     ``check_make_folder`` during construction.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Default layout; switched when related genomes are compared.
     file_type = "mutations_of_reference_genomes"
     if args_snp.types == "related_genome":
         file_type = "compare_related_and_reference_genomes"
     out_dir = args_snp.out_folder
     result_dir = os.path.join(out_dir, file_type)
     self.seq_path = os.path.join(result_dir, "seqs")
     self.stat_path = os.path.join(result_dir, "statistics")
     self.fig_path = os.path.join(self.stat_path, "figs")
     self.helper.check_make_folder(self.fig_path)
     self.outputs = dict(
         table=os.path.join(result_dir, "SNP_tables"),
         raw=os.path.join(result_dir, "SNP_raw_outputs"),
         tmp=os.path.join(out_dir, "tmp_bcf"),
         depth=os.path.join(out_dir, "tmp_depth"))
     self.bams = dict(
         whole=os.path.join(out_dir, "whole_reads.bam"),
         sort=os.path.join(out_dir, "whole_reads_sorted.bam"),
         bams=[])
     self.header = os.path.join(out_dir, "header")
     self.baqs = {"with": "with_BAQ",
                  "without": "without_BAQ",
                  "extend": "extend_BAQ"}
Esempio n. 4
0
class TestSeqEditer(unittest.TestCase):
    """Tests for SeqEditer: mutation-table import, sequence and header edits."""

    def setUp(self):
        """Create the scratch folders and the SeqEditer instance under test."""
        self.example = Example()
        self.test_folder = "test_folder"
        self.fasta = os.path.join(self.test_folder, "fasta")
        # Create each folder on its own so that a stale "test_folder" left
        # behind by an aborted run still gets its "fasta" sub-folder.
        for folder in (self.test_folder, self.fasta):
            if not os.path.exists(folder):
                os.mkdir(folder)
        self.seq = SeqEditer()

    def tearDown(self):
        """Remove the scratch folder together with everything inside it."""
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_import_data(self):
        """_import_data returns one entry per target with its mutations."""
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        datas = self.seq._import_data(mod_table)
        expected = [
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'c', 'ref_nt': 'a', 'position': '3'},
                    {'tar_nt': '-', 'ref_nt': 'a', 'position': '6'},
                ],
                'target_id': 'NC_test.1',
            },
            {
                'ref_id': 'NC_000915.1',
                'datas': [
                    {'tar_nt': 'g', 'ref_nt': '-', 'position': '6'},
                ],
                'target_id': 'test_case2',
            },
        ]
        self.assertListEqual(datas, expected)

    def test_modify_seq(self):
        """modify_seq writes one fasta file per target of the table."""
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        gen_file(os.path.join(self.fasta, "NC_000915.1.fa"),
                 self.example.fasta)
        self.seq.modify_seq(self.fasta, mod_table, self.test_folder)
        datas = import_data(os.path.join(self.test_folder, "NC_test.1.fa"))
        self.assertEqual("\n".join(datas), self.example.out_1)
        datas = import_data(os.path.join(self.test_folder, "test_case2.fa"))
        self.assertEqual("\n".join(datas), self.example.out_2)

    def test_modify_header(self):
        """modify_header rewrites the '|'-separated header to '>DDD'."""
        input_file = os.path.join(self.test_folder, "test.fa")
        gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT")
        self.seq.modify_header(input_file)
        datas = import_data(input_file)
        self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT")
 def __init__(self, tar_folder, ref_folder):
     """Create helper objects and record the "tmp" sub-folder of each input.

     The paths are only computed here; nothing is created on disk.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     tmp_name = "tmp"
     self.folders = dict(tmp_tar=os.path.join(tar_folder, tmp_name),
                         tmp_ref=os.path.join(ref_folder, tmp_name))
Esempio n. 6
0
 def setUp(self):
     """Create the scratch folders and the SeqEditer instance under test."""
     self.example = Example()
     self.test_folder = "test_folder"
     self.fasta = os.path.join(self.test_folder, "fasta")
     # Create each folder on its own so that a stale "test_folder" left
     # behind by an aborted run still gets its "fasta" sub-folder.
     for folder in (self.test_folder, self.fasta):
         if not os.path.exists(folder):
             os.mkdir(folder)
     self.seq = SeqEditer()
Esempio n. 7
0
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to copies
    of the reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record the temporary target folder.

        ref_folder is accepted but not used here; the temporary reference
        folder is registered later by gen_folder.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Prepare the temporary reference and target folders.

        The reference files are copied to "tmp_reference" inside out_folder
        and handed to Multiparser.parser_fasta; the temporary target folder
        is recreated empty. Returns the path of the reference copy folder.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Test the folder that is actually removed and recreated; looking
        # for a "tmp_tar" entry in out_folder never matched this path and
        # let os.mkdir fail on a left-over folder.
        tmp_tar = self.folders["tmp_tar"]
        if os.path.exists(tmp_tar):
            shutil.rmtree(tmp_tar)
        os.mkdir(tmp_tar)
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, output,
                         out_folder):
        '''Write the requested target fasta files and clean up afterwards.

        Each entry of output has the form "filename:strain1,strain2,...";
        the generated "<strain>.fa" files are concatenated into filename,
        separated by a newline. Missing strains are reported on stdout.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        tmp_tar = self.folders["tmp_tar"]
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   tmp_tar)
        print("Transfering to target fasta")
        # The generated files do not change while merging, so the folder
        # is listed once instead of once per strain.
        generated = set(os.listdir(tmp_tar))
        for file_ in output:
            first = True
            datas = file_.split(":")
            filename = datas[0]
            strains = datas[1].split(",")
            # "with" closes the output file even if copying fails.
            with open(filename, "w") as out:
                for strain in strains:
                    fasta_name = strain + ".fa"
                    if fasta_name not in generated:
                        print("Error: No fasta information of "
                              "{0}.fa".format(strain))
                        continue
                    if first:
                        first = False
                    else:
                        out.write("\n")
                    with open(os.path.join(tmp_tar, fasta_name)) as f_h:
                        for line in f_h:
                            out.write(line)
        shutil.rmtree(tmp_tar)
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta file to remapping again.")
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to the
    reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record both "tmp" sub-folders.'''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {
            "tmp_tar": os.path.join(tar_folder, "tmp"),
            "tmp_ref": os.path.join(ref_folder, "tmp")
        }

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        '''Write the target fasta files into tar_folder and clean up.

        When output is None every generated ".fa" file is moved to
        tar_folder by the helper. Otherwise each entry has the form
        "filename:strain1_and_strain2..." and the generated "<strain>.fa"
        files are concatenated into "<filename>.fa", separated by a
        newline. Missing strains are reported on stdout.
        '''
        self.multiparser.parser_fasta(ref_folder)
        tmp_tar = self.folders["tmp_tar"]
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(tmp_tar)
        os.mkdir(tmp_tar)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   tmp_tar)
        print("transfer to target fasta...")
        if output is not None:
            # The generated files do not change while merging, so the
            # folder is listed once instead of once per strain.
            generated = set(os.listdir(tmp_tar))
            for file_ in output:
                first = True
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                out_path = os.path.join(tar_folder, filename + ".fa")
                # "with" closes the output file even if copying fails.
                with open(out_path, "w") as out:
                    for strain in strains:
                        fasta_name = strain + ".fa"
                        if fasta_name not in generated:
                            print("Error:no fasta information of "
                                  "{0}.fa".format(strain))
                            continue
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(os.path.join(tmp_tar,
                                               fasta_name)) as f_h:
                            for line in f_h:
                                out.write(line)
        else:
            self.helper.move_all_content(tmp_tar, tar_folder, [".fa"])
        shutil.rmtree(tmp_tar)
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
class TargetFasta(object):
    '''Generate target fasta files by applying a mutation table to the
    reference fasta files.'''

    def __init__(self, tar_folder, ref_folder):
        '''Create the helper objects and record both "tmp" sub-folders.'''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp"),
                        "tmp_ref": os.path.join(ref_folder, "tmp")}

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        '''Write the target fasta files into tar_folder and clean up.

        output is either None (all generated ".fa" files are moved to
        tar_folder by the helper) or a sequence of entries of the form
        "filename:strain1_and_strain2...". For each entry the generated
        "<strain>.fa" files are concatenated into "<filename>.fa" with a
        newline between them; missing strains are reported on stdout.
        '''
        self.multiparser.parser_fasta(ref_folder)
        tmp_tar = self.folders["tmp_tar"]
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(tmp_tar)
        os.mkdir(tmp_tar)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   tmp_tar)
        print("transfer to target fasta...")
        if output is None:
            self.helper.move_all_content(tmp_tar, tar_folder, [".fa"])
        else:
            # List the generated files once; they do not change while the
            # merged files are written.
            generated = set(os.listdir(tmp_tar))
            for file_ in output:
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                first = True
                # The context manager guarantees the merged file is closed
                # even when reading one of the strain files fails.
                with open(os.path.join(tar_folder,
                                       filename + ".fa"), "w") as out:
                    for strain in strains:
                        if strain + ".fa" in generated:
                            if first:
                                first = False
                            else:
                                out.write("\n")
                            with open(os.path.join(
                                      tmp_tar, strain + ".fa")) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error:no fasta information of "
                                  "{0}.fa".format(strain))
        shutil.rmtree(tmp_tar)
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
Esempio n. 10
0
 def __init__(self, args_snp):
     """Build the helper objects and every path needed for SNP calling.

     Only ``types`` and ``out_folder`` of ``args_snp`` are read. The
     "figs" folder below the statistics path is handed to the helper's
     ``check_make_folder``.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     related = args_snp.types == "related_genome"
     file_type = ("compare_related_and_reference_genomes" if related
                  else "mutations_of_reference_genomes")
     root = args_snp.out_folder
     base = os.path.join(root, file_type)
     self.seq_path = os.path.join(base, "seqs")
     self.stat_path = os.path.join(base, "statistics")
     self.fig_path = os.path.join(self.stat_path, "figs")
     self.helper.check_make_folder(self.fig_path)
     self.outputs = {
         "table": os.path.join(base, "SNP_tables"),
         "raw": os.path.join(base, "SNP_raw_outputs"),
         "tmp": os.path.join(root, "tmp_bcf"),
         "depth": os.path.join(root, "tmp_depth"),
     }
     self.bams = {
         "whole": os.path.join(root, "whole_reads.bam"),
         "sort": os.path.join(root, "whole_reads_sorted.bam"),
         "bams": [],
     }
     self.header = os.path.join(root, "header")
     self.baqs = {
         "with": "with_BAQ",
         "without": "without_BAQ",
         "extend": "extend_BAQ",
     }
Esempio n. 11
0
 def __init__(self, args_snp):
     """Create the helper objects and derive the paths used by SNP calling.

     ``args_snp`` is read for its ``types``, ``out_folder`` and ``fastas``
     attributes.
     """
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Results are grouped in a sub-folder selected by the comparison type.
     file_type = "validate_target"
     if args_snp.types == "reference":
         file_type = "compare_reference"
     out_dir = args_snp.out_folder
     result_dir = os.path.join(out_dir, file_type)
     self.seq_path = os.path.join(result_dir, "seqs")
     self.stat_path = os.path.join(result_dir, "statistics")
     self.fasta_path = os.path.join(args_snp.fastas, "tmp")
     self.outputs = dict(
         table=os.path.join(result_dir, "SNP_table"),
         raw=os.path.join(result_dir, "SNP_raw_outputs"),
         tmp=os.path.join(out_dir, "tmp_bcf"))
     # When a "whole_reads.bam" is already present, hand the output folder
     # to the helper with the "whole_read" / "file" arguments.
     if "whole_reads.bam" in os.listdir(out_dir):
         self.helper.remove_all_content(out_dir, "whole_read", "file")
     self.bams = dict(
         whole=os.path.join(out_dir, "whole_reads.bam"),
         sort=os.path.join(out_dir, "whole_reads_sorted.bam"))
     self.header = os.path.join(out_dir, "header")
     self.baqs = {
         "with": "with_BAQ",
         "without": "without_BAQ",
         "extend": "extend_BAQ",
     }
Esempio n. 12
0
 def __init__(self):
     """Instantiate the helpers and set the temporary file names."""
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     # Temporary fasta / gff names, then the forward and reverse wig names.
     self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
     self.tmp_wig_forward, self.tmp_wig_reverse = (
         "tmp_forward.wig", "tmp_reverse.wig")
Esempio n. 13
0
class TestSeqEditer(unittest.TestCase):
    """Tests for SeqEditer: mutation-table import, sequence and header edits."""

    def setUp(self):
        """Create the scratch folders and the SeqEditer instance under test."""
        self.example = Example()
        self.test_folder = "test_folder"
        self.fasta = os.path.join(self.test_folder, "fasta")
        # Create each folder on its own so that a stale "test_folder" left
        # behind by an aborted run still gets its "fasta" sub-folder.
        for folder in (self.test_folder, self.fasta):
            if not os.path.exists(folder):
                os.mkdir(folder)
        self.seq = SeqEditer()

    def tearDown(self):
        """Remove the scratch folder together with everything inside it."""
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_import_data(self):
        """_import_data returns one entry per target with its mutations."""
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        datas = self.seq._import_data(mod_table)
        first_target = {
            'ref_id': 'NC_000915.1',
            'datas': [{'tar_nt': 'c', 'ref_nt': 'a', 'position': '3'},
                      {'tar_nt': '-', 'ref_nt': 'a', 'position': '6'}],
            'target_id': 'NC_test.1'}
        second_target = {
            'ref_id': 'NC_000915.1',
            'datas': [{'tar_nt': 'g', 'ref_nt': '-', 'position': '6'}],
            'target_id': 'test_case2'}
        self.assertListEqual(datas, [first_target, second_target])

    def test_modify_seq(self):
        """modify_seq writes one fasta file per target of the table."""
        mod_table = os.path.join(self.test_folder, "mod")
        gen_file(mod_table, self.example.mutation)
        gen_file(os.path.join(self.fasta, "NC_000915.1.fa"),
                 self.example.fasta)
        self.seq.modify_seq(self.fasta, mod_table, self.test_folder)
        datas = import_data(os.path.join(self.test_folder, "NC_test.1.fa"))
        self.assertEqual("\n".join(datas), self.example.out_1)
        datas = import_data(os.path.join(self.test_folder, "test_case2.fa"))
        self.assertEqual("\n".join(datas), self.example.out_2)

    def test_modify_header(self):
        """modify_header rewrites the '|'-separated header to '>DDD'."""
        input_file = os.path.join(self.test_folder, "test.fa")
        gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT")
        self.seq.modify_header(input_file)
        datas = import_data(input_file)
        self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT")
Esempio n. 14
0
def deal_detect(input_file, file_path, change, input_folder):
    '''Normalise the header of a fasta file and file it under its name.

    When change is true, input_file is first moved to file_path. The
    header is rewritten by SeqEditer.modify_header, then the file is moved
    to "<seq_name>.fa" inside input_folder, where seq_name is the text
    after ">" on the last header line of the file.

    Returns (change, seq_name); change is always False on return when it
    was true on entry.

    Raises ValueError when the file contains no header line.
    '''
    if change:
        shutil.move(input_file, file_path)
        change = False
    SeqEditer().modify_header(file_path)
    seq_name = None
    with open(file_path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                seq_name = line[1:]
    # Without a header there is no name to file the sequence under; fail
    # with a clear message instead of an UnboundLocalError.
    if seq_name is None:
        raise ValueError(
            "No fasta header (line starting with '>') found in "
            "{0}".format(file_path))
    shutil.move(file_path, os.path.join(input_folder, seq_name + ".fa"))
    return change, seq_name
Esempio n. 15
0
def deal_detect(input_file, file_path, change, input_folder):
    '''Deal with the header of a fasta file and put the file into the
    corresponding folder.

    When change is true, input_file is first moved to file_path. The
    header is rewritten by SeqEditer.modify_header, then the file is moved
    to "<seq_name>.fa" inside input_folder, where seq_name is the text
    after ">" on the last header line of the file.

    Returns (change, seq_name); change is always False on return when it
    was true on entry.

    Raises ValueError when the file contains no header line.
    '''
    if change:
        shutil.move(input_file, file_path)
        change = False
    SeqEditer().modify_header(file_path)
    seq_name = None
    with open(file_path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                seq_name = line[1:]
    # Without a header there is no name to file the sequence under; fail
    # with a clear message instead of an UnboundLocalError.
    if seq_name is None:
        raise ValueError(
            "No fasta header (line starting with '>') found in "
            "{0}".format(file_path))
    shutil.move(file_path, os.path.join(input_folder, seq_name + ".fa"))
    return change, seq_name
Esempio n. 16
0
class SNPCalling(object):
    '''Merge and sort BAM files, then call SNPs per genome with
    samtools mpileup and bcftools call.'''

    def __init__(self, args_snp):
        '''Create the helper objects and derive all working paths.

        args_snp is read for its types, out_folder and fastas attributes.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_table"),
                        "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf")}
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam")}
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _import_bam(self, bam_folder, bams):
        '''Append the path of every ".bam" file in bam_folder to bams and
        return how many were added.'''
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_,
                        prefix, bam_number, table_path, args_snp):
        '''Run snp_detect on one VCF file and move the ".png" files from
        table_path to the statistics folder.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_file = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP.csv"]))
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_, args_snp):
        '''Run samtools mpileup (stdout goes to out_bcf) followed by
        bcftools call, and return the path of the VCF file.

        type_ selects the BAQ mode: "with", "without" or "extend".
        bcftools is only run when args_snp.chrom is "1" or "2".
        '''
        # The three modes differ only in one extra mpileup flag.
        baq_flags = {"with": [], "without": ["-B"], "extend": ["-E"]}
        if type_ in baq_flags:
            call([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                 baq_flags[type_] +
                 ["-ugf", fasta_file, self.bams["sort"], "--ignore-RG"],
                 stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Call SNPs for one fasta file in one BAQ mode and post-process
        the resulting VCF file.'''
        # "with" guarantees the temporary BCF handle is closed even when
        # one of the tools or the post-processing raises.
        with open(self.outputs["tmp"], "w") as out_bcf:
            out_vcf = self._run_tools(fasta_file, out_bcf,
                                      file_prefixs["raw_prefix"], type_,
                                      args_snp)
            self.helper.check_make_folder(
                 os.path.join(self.seq_path, self.baqs[type_], prefix))
            self._transcript_snp(
                fasta_file, out_vcf,
                "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
                type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Run every BAQ mode listed in args_snp.program ("1", "2", "3");
        exit the program on any other value.'''
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        '''Return (detect, prefix) for a fasta file name.

        detect is True when the name ends with ".fa", ".fna" or ".fasta";
        prefix is the name without that extension, or None when the
        extension is not recognised.
        '''
        for extension in (".fa", ".fna", ".fasta"):
            if fasta.endswith(extension):
                return (True, fasta[:-len(extension)])
        # prefix used to be unbound here, raising UnboundLocalError.
        return (False, None)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run "samtools merge" or "samtools sort" through the shell.

        For "merge", bam_file is the space-separated list of input BAMs
        merged into the whole-reads BAM; for "sort", bam_file is the
        output path of the sorted whole-reads BAM.
        '''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                       "-o", bam_file, self.bams["whole"]]))
        os.system(command)

    def _merge_bams(self, args_snp):
        '''Collect the BAM files of the given folders, merge them (or copy
        a single one) and sort the result. Returns the number of BAMs.

        Exits the program when no BAM folder is given or no BAM file is
        found.
        '''
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            if args_snp.normal_bams is not None:
                num_normal = self._import_bam(args_snp.normal_bams, bams)
            if args_snp.frag_bams is not None:
                num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        if num_bam == 0:
            # Nothing to copy or merge; bams[0] below would raise IndexError.
            print("Error: There is no BAM file in the assigned folders!!")
            sys.exit()
        if num_bam <= 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        '''Rewrite the header of every file in fastas whose name ends with
        "fasta", "fa" or "fna".'''
        for fasta in os.listdir(fastas):
            if fasta.endswith(("fasta", "fa", "fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        '''Write the header of the sorted BAM file ("samtools view -H")
        to self.header using shell redirection.'''
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        '''Return the sequence names listed in the "@SQ" rows of the
        sorted BAM header (second column, text after the first ":").'''
        self._get_header(samtools_path)
        seq_names = []
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if row[0] == "@SQ":
                    seq_names.append(row[1].split(":")[1])
        return seq_names

    def run_snp_calling(self, args_snp):
        '''Entry point: prepare fasta and BAM files, call SNPs for every
        fasta whose name matches a sequence of the BAM header, then
        remove the temporary files.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        else:
            for fasta in os.listdir(self.fasta_path):
                if (fasta.split(".f")[0] in seq_names):
                    detect, prefix = self._detect_fasta(fasta)
                    if detect:
                        print("Computing {0} now ...".format(fasta))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                             os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                                        self.outputs["raw"], prefix, prefix),
                                        "table_prefix": os.path.join(
                                        self.outputs["table"], prefix, prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
Esempio n. 17
0
class Multiparser(object):

    def __init__(self):
        '''Create the helper objects and the names of the scratch files.'''
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Base names of the temporary files written while merging.
        self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
        self.tmp_wig_forward, self.tmp_wig_reverse = (
            "tmp_forward.wig", "tmp_reverse.wig")

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging fasta file of " + prefix)
                for file_ in os.listdir("/".join([ref_folder, folder])):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    if tar.endswith(".fa") or \
                       tar.endswith(".fna") or \
                       tar.endswith(".fasta"):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                     os.path.join(tar_folder, tar),
                                     os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        '''Return the file prefix encoded in a ``*_folder`` directory name.

        Args:
            folder: directory name of the form ``<file name>_folder``.
            ref_feature: "" (plain file, a four-character extension is
                stripped), "_fasta" (a fasta extension is stripped) or
                "_<feature>" (extension and feature suffix are stripped).

        Returns:
            The prefix string.
        '''
        name = folder.split("_folder")[0]
        if ref_feature == "":
            return name[:-4]
        if ref_feature == "_fasta":
            for ext in (".fa", ".fna", ".fasta"):
                if name.endswith(ext):
                    return name[:-len(ext)]
            # Unknown extension: previously ``prefix`` stayed unbound and
            # the return raised UnboundLocalError. Fall back to dropping
            # whatever extension is present.
            return os.path.splitext(name)[0]
        return name[:-4].split(ref_feature)[0]

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig file of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (
                                file_ == filename[-1][:-4]):
                            for lib in libs:
                                if (filename[0] in lib) and (lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_reverse))
                                change_r = True
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(os.path.join(tar_folder, self.tmp_wig_forward),
                                os.path.join(tar_merge,
                                             prefix + "_forward.wig"))
                    shutil.move(os.path.join(tar_folder, self.tmp_wig_reverse),
                                os.path.join(tar_merge,
                                             prefix + "_reverse.wig"))
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging gff file of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (
                                file_ + tar_feature == tar[:-4]):
                            self.helper.merge_file(
                                 os.path.join(tar_folder, tar),
                                 os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_gff),
                                os.path.join(tar_folder, "merge_tmp",
                                prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        for fasta in os.listdir(fastas):
            if (fasta.endswith("fasta") or
                    fasta.endswith("fa") or
                    fasta.endswith("fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parser " + fasta + "...")
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                if ("|" in line) and (
                                        len(line.split("|")) > 4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                           out_path, name + ".fa"), "w")
                                out_t = open(os.path.join(
                                             par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        out.close()
        out_t.close()

    def parser_gff(self, gff_folder, feature):
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    print("Parser " + filename + "...")
                    self.helper.check_make_folder(out_path)
                    self.helper.sort_gff(os.path.join(gff_folder, filename),
                                         os.path.join(gff_folder, "tmp.gff"))
                    f_h = open(os.path.join(gff_folder, "tmp.gff"), "r")
                    for row in csv.reader(f_h, delimiter="\t"):
                        if row[0].startswith("#"):
                            continue
                        else:
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(out_path,
                                           row[0] + feature + ".gff"), "w")
                                out_t = open(os.path.join(par_tmp,
                                             row[0] + feature + ".gff"), "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                    f_h.close()
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        out.close()
        out_t.close()

    def parser_wig(self, wig_folder):
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    print("Parser {0}...".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename), "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open("".join([
                                    os.path.join(out_path, filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                out_t = open("".join([
                                    os.path.join(wig_folder, "tmp",
                                                 filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (
                                    line[0] != "variableStep"):
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        out.close()
        out_t.close()
Esempio n. 18
0
 def __init__(self, tar_folder, ref_folder):
     '''Create the helper objects and remember the tmp folder of the
     target folder. ``ref_folder`` is accepted but not used here.'''
     self.multiparser = Multiparser()
     self.seq_editer = SeqEditer()
     self.helper = Helper()
     tmp_tar = os.path.join(tar_folder, "tmp")
     self.folders = dict(tmp_tar=tmp_tar)
Esempio n. 19
0
class SNPCalling(object):
    def __init__(self, args_snp):
        '''Set up helper objects and all paths used during SNP calling.

        Reads ``types``, ``out_folder`` and ``fastas`` from ``args_snp``
        and removes leftover ``whole_read*`` files of the output folder
        when a ``whole_reads.bam`` is already present.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        file_type = ("compare_reference" if args_snp.types == "reference"
                     else "validate_target")
        out_dir = args_snp.out_folder
        self.seq_path = os.path.join(out_dir, file_type, "seqs")
        self.stat_path = os.path.join(out_dir, file_type, "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = dict(
            table=os.path.join(out_dir, file_type, "SNP_table"),
            raw=os.path.join(out_dir, file_type, "SNP_raw_outputs"),
            tmp=os.path.join(out_dir, "tmp_bcf"))
        # Clean up the merged BAM files of an earlier run.
        if "whole_reads.bam" in os.listdir(out_dir):
            self.helper.remove_all_content(out_dir, "whole_read", "file")
        self.bams = dict(
            whole=os.path.join(out_dir, "whole_reads.bam"),
            sort=os.path.join(out_dir, "whole_reads_sorted.bam"))
        self.header = os.path.join(out_dir, "header")
        self.baqs = dict([("with", "with_BAQ"),
                          ("without", "without_BAQ"),
                          ("extend", "extend_BAQ")])

    def _import_bam(self, bam_folder, bams):
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        '''Run ``snp_detect`` for one genome and one BAQ mode, then move
        the ``.png`` files of ``table_path`` to the statistics folder.'''
        baq_name = self.baqs[type_]
        seq_prefix = os.path.join(self.seq_path, baq_name, prefix, prefix)
        stat_name = "_".join(["stat", "_".join([prefix, baq_name]),
                              "SNP.csv"])
        stat_file = os.path.join(self.stat_path, stat_name)
        snp_detect(fasta, snp, out_table_prefix, seq_prefix, bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_, args_snp):
        if type_ == "with":
            call([
                args_snp.samtools_path, "mpileup", "-t", "DP", "-ugf",
                fasta_file, self.bams["sort"], "--ignore-RG"
            ],
                 stdout=out_bcf)
        elif type_ == "without":
            call([
                args_snp.samtools_path, "mpileup", "-t", "DP", "-B", "-ugf",
                fasta_file, self.bams["sort"], "--ignore-RG"
            ],
                 stdout=out_bcf)
        elif type_ == "extend":
            call([
                args_snp.samtools_path, "mpileup", "-t", "DP", "-E", "-ugf",
                fasta_file, self.bams["sort"], "--ignore-RG"
            ],
                 stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([
                args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                self.outputs["tmp"], "-vmO", "v", "-o", out_vcf
            ])
        elif args_snp.chrom == "2":
            call([
                args_snp.bcftools_path, "call", self.outputs["tmp"], "-vmO",
                "v", "-o", out_vcf
            ])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Call SNPs for one genome with one BAQ mode and post-process
        the resulting VCF file.

        Args:
            args_snp: argument container passed on to the tools.
            fasta_file: path of the genome fasta file.
            type_: BAQ mode, a key of ``self.baqs``.
            file_prefixs: dict with the keys "raw_prefix" and
                "table_prefix".
            prefix: genome name used for folders and file names.
            table_path: folder of the SNP tables of this genome.
            bam_number: number of BAM files that were merged.
        '''
        # The context manager closes the BCF file even when a tool fails,
        # and before the downstream step starts.
        with open(self.outputs["tmp"], "w") as out_bcf:
            out_vcf = self._run_tools(fasta_file, out_bcf,
                                      file_prefixs["raw_prefix"], type_,
                                      args_snp)
        self.helper.check_make_folder(
            os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]), type_,
            prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        detect = False
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        if sub_command == "merge":
            command = (" ".join(
                [samtools_path, sub_command, self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([
                samtools_path, sub_command, "-o", bam_file, self.bams["whole"]
            ]))
        os.system(command)

    def _merge_bams(self, args_snp):
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            if args_snp.normal_bams is not None:
                num_normal = self._import_bam(args_snp.normal_bams, bams)
            if args_snp.frag_bams is not None:
                num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        if num_bam <= 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        self._get_header(samtools_path)
        fh = open(self.header, "r")
        seq_names = []
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                seq_names.append(row[1].split(":")[1])
        fh.close()
        return seq_names

    def run_snp_calling(self, args_snp):
        '''Run the whole SNP calling workflow.

        Splits the fasta files, merges and sorts the BAM files, and runs
        the requested BAQ programs for every fasta file of
        ``self.fasta_path`` whose sequence is listed in the BAM header.
        Temporary files are removed at the end.

        Args:
            args_snp: argument container; ``program`` must contain at
                least one of "1", "2" or "3", otherwise the program
                exits with an error message.
        '''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if not any(code in args_snp.program for code in ("1", "2", "3")):
            # "2" selects the run without BAQ (see _run_program); the
            # message used to describe it as 'with_BAQ'.
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        for fasta in os.listdir(self.fasta_path):
            if fasta.split(".f")[0] not in seq_names:
                continue
            detect, prefix = self._detect_fasta(fasta)
            if not detect:
                continue
            print("Computing {0} now ...".format(fasta))
            table_path = os.path.join(self.outputs["table"], prefix)
            raw_path = os.path.join(self.outputs["raw"], prefix)
            self.helper.check_make_folder(table_path)
            self.helper.check_make_folder(raw_path)
            file_prefixs = {
                "raw_prefix": os.path.join(raw_path, prefix),
                "table_prefix": os.path.join(table_path, prefix)
            }
            fasta_file = os.path.join(self.fasta_path, fasta)
            self._run_program(fasta_file, file_prefixs, prefix,
                              bam_number, table_path, args_snp)
            os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
Esempio n. 20
0
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        '''Set up helper objects and all paths used during SNP calling.

        Reads ``types``, ``out_folder`` and ``fastas`` from ``args_snp``
        and removes leftover ``whole_read*`` files of the output folder
        when a ``whole_reads.bam`` is already present.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        file_type = ("compare_reference" if args_snp.types == "reference"
                     else "validate_target")
        out_dir = args_snp.out_folder
        self.seq_path = os.path.join(out_dir, file_type, "seqs")
        self.stat_path = os.path.join(out_dir, file_type, "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = dict(
            table=os.path.join(out_dir, file_type, "SNP_table"),
            raw=os.path.join(out_dir, file_type, "SNP_raw_outputs"),
            tmp=os.path.join(out_dir, "tmp_bcf"),
            depth=os.path.join(out_dir, "tmp_depth"))
        # Clean up the merged BAM files of an earlier run.
        if "whole_reads.bam" in os.listdir(out_dir):
            self.helper.remove_all_content(out_dir, "whole_read", "file")
        self.bams = dict(
            whole=os.path.join(out_dir, "whole_reads.bam"),
            sort=os.path.join(out_dir, "whole_reads_sorted.bam"),
            bams=[])
        self.header = os.path.join(out_dir, "header")
        self.baqs = dict([("with", "with_BAQ"),
                          ("without", "without_BAQ"),
                          ("extend", "extend_BAQ")])

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_,
                        prefix, bam_number, table_path, args_snp):
        '''Run ``snp_detect`` for one genome and one BAQ mode, then move
        the ``.png`` files of ``table_path`` to the statistics folder.'''
        baq_name = self.baqs[type_]
        seq_prefix = os.path.join(self.seq_path, baq_name, prefix, prefix)
        stat_name = "_".join(["stat", "_".join([prefix, baq_name]), "SNP"])
        stat_prefix = os.path.join(self.stat_path, stat_name)
        snp_detect(fasta, snp, self.outputs["depth"], out_table_prefix,
                   seq_prefix, bam_number, stat_prefix, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _get_para(self, args_snp):
        bams = self.bams["sort"]
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bams, bcf_para

    def _run_tools(self, fasta_file, out_raw_prefix, type_, args_snp):
        bams, bcf_para = self._get_para(args_snp)
        if type_ == "with":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
        elif type_ == "without":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
        elif type_ == "extend":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
        if args_snp.rg:
            command = command + ["-ugf", fasta_file, bams]
        else:
            command = command + ["--ignore-RG", "-ugf", fasta_file, bams]
        os.system(" ".join(command) + ">" + self.outputs["tmp"])
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        '''Call SNPs for one genome with one BAQ mode and hand the VCF
        file to ``_transcript_snp``.'''
        raw_prefix = file_prefixs["raw_prefix"]
        out_vcf = self._run_tools(fasta_file, raw_prefix, type_, args_snp)
        seq_folder = os.path.join(self.seq_path, self.baqs[type_], prefix)
        self.helper.check_make_folder(seq_folder)
        table_prefix = "_".join([file_prefixs["table_prefix"],
                                 self.baqs[type_]])
        self._transcript_snp(fasta_file, out_vcf, table_prefix, type_,
                             prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        detect = False
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                                 "-o", bam_file, self.bams["whole"]]))
        os.system(command)
        self.bams["bams"].append(bam_file.replace(".bam", "_sort.bam"))

    def _merge_bams(self, args_snp):
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            num_bam = 0
            for files in args_snp.bams:
                for bam in glob(files):
                    bams.append(bam)
                    num_bam += 1
        if num_bam <= 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        else:
            print("Merging BAM files now")
            self._run_bam(args_snp.samtools_path, "merge",
                          " ".join(bams))
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort",
                          self.bams["sort"])
        out_depth = open(self.outputs["depth"], "w")
        call([args_snp.samtools_path, "index",  self.bams["sort"]])
        call([args_snp.samtools_path, "depth",  self.bams["sort"]],
             stdout=out_depth)
        return num_bam

    def _modify_header(self, fastas):
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp):
        seq_names = []
        self._get_header(args_snp.samtools_path, self.bams["sort"],
                         seq_names)
        return seq_names

    def _remove_bams(self):
        if os.path.exists(self.bams["whole"]):
            os.remove(self.bams["whole"])
        if os.path.exists(self.bams["whole"] + ".bai"):
            os.remove(self.bams["whole"] + ".bai")
        if os.path.exists(self.bams["sort"]):
            os.remove(self.bams["sort"])
        if os.path.exists(self.bams["sort"] + ".bai"):
            os.remove(self.bams["sort"] + ".bai")
        if os.path.exists(self.header):
            os.remove(self.header)
        os.remove(self.outputs["depth"])

    def run_snp_calling(self, args_snp):
        '''Run the whole SNP calling workflow: split the fasta files,
        merge and sort the BAM files, run the requested BAQ programs for
        every fasta file whose sequence is listed in the BAM header, and
        remove the temporary files.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp)
        known = ("with_BAQ", "without_BAQ", "extend_BAQ")
        if not any(name in args_snp.program for name in known):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        for fasta in os.listdir(self.fasta_path):
            if fasta.split(".f")[0] not in seq_names:
                continue
            detect, prefix = self._detect_fasta(fasta)
            if not detect:
                continue
            print("Computing {0} now".format(fasta))
            table_path = os.path.join(self.outputs["table"], prefix)
            raw_path = os.path.join(self.outputs["raw"], prefix)
            self.helper.check_make_folder(table_path)
            self.helper.check_make_folder(raw_path)
            file_prefixs = {
                "raw_prefix": os.path.join(raw_path, prefix),
                "table_prefix": os.path.join(table_path, prefix)}
            fasta_file = os.path.join(self.fasta_path, fasta)
            self._run_program(fasta_file, file_prefixs, prefix,
                              bam_number, table_path, args_snp)
            # The intermediate BCF file is rewritten for every genome.
            os.remove(self.outputs["tmp"])
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams()
Esempio n. 21
0
class Multiparser(object):
    def __init__(self):
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''combine multiple fasta files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging fasta files of " + prefix)
                for file_ in os.listdir("/".join([ref_folder, folder])):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    if tar.endswith(".fa") or \
                       tar.endswith(".fna") or \
                       tar.endswith(".fasta"):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''combine multiple wig files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig files of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (file_
                                                       == filename[-1][:-4]):
                            for lib in libs:
                                if (filename[0] in lib) and (lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(tar_folder,
                                                     self.tmp_wig_reverse))
                                    change_r = True
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_forward),
                        os.path.join(tar_merge, prefix + "_forward.wig"))
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_reverse),
                        os.path.join(tar_merge, prefix + "_reverse.wig"))
                else:
                    print("Error: comparing input files of {0} failed. "
                          "Please check the seq IDs of all gff and fasta "
                          "files, they should be the same.\nPlease "
                          "also check the wiggle files which should contain "
                          "forward and reverse files.".format(prefix))
                    sys.exit()
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''combine multiple gff files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging gff files of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (file_ + tar_feature
                                                == tar[:-4]):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_gff),
                        os.path.join(tar_folder, "merge_tmp",
                                     prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''parser the fasta file based on strain'''
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fasta") or fasta.endswith(".fa")
                    or fasta.endswith(".fna")):
                detect = True
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        if not detect:
            print("Error: there are folders which conatin no fasta files! "
                  "The files should end with .fa or .fna or .fasta!")
            sys.exit()
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parsing " + fasta)
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                if ("|" in line) and (len(line.split("|")) >
                                                      4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    os.path.join(out_path, name + ".fa"), "w")
                                out_t = open(
                                    os.path.join(par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_gff(self, gff_folder, feature):
        '''parser gff file based on strain'''
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        detect = False
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    detect = True
                    print("Parsing " + filename)
                    self.helper.check_make_folder(out_path)
                    self.helper.sort_gff(os.path.join(gff_folder, filename),
                                         os.path.join(gff_folder, "tmp.gff"))
                    f_h = open(os.path.join(gff_folder, "tmp.gff"), "r")
                    for row in csv.reader(f_h, delimiter="\t"):
                        if row[0].startswith("#"):
                            continue
                        else:
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    os.path.join(out_path,
                                                 row[0] + feature + ".gff"),
                                    "w")
                                out_t = open(
                                    os.path.join(par_tmp,
                                                 row[0] + feature + ".gff"),
                                    "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                    f_h.close()
        if not detect:
            print("Error: There are folders which contain no gff3 files! "
                  "The files should end with .gff!")
            sys.exit()
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_wig(self, wig_folder):
        '''parser the wig file based on strain'''
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    detect = True
                    print("Parsing {0}".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename), "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(
                                    "".join([
                                        os.path.join(out_path, filename[:-4]),
                                        "_STRAIN_", strain[1], ".wig"
                                    ]), "w")
                                out_t = open(
                                    "".join([
                                        os.path.join(wig_folder, "tmp",
                                                     filename[:-4]),
                                        "_STRAIN_", strain[1], ".wig"
                                    ]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (line[0] !=
                                                         "variableStep"):
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        if not detect:
            print("Error: There are folders which contain no wig files! "
                  "The files should end with .wig!")
            sys.exit()
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()
Esempio n. 22
0
class TargetFasta(object):
    '''Generate updated fasta files from references and a mutation table.'''

    def __init__(self, tar_folder, ref_folder):
        # ref_folder is accepted for interface compatibility only; the
        # reference files are handed to gen_folder/get_target_fasta.
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        '''Prepare the working folders and return the reference copy folder.

        The reference files are copied to "<out_folder>/tmp_reference" and
        parsed there; "<out_folder>/fasta_files" and the temporary target
        folder are recreated empty.
        '''
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        fasta_folder = os.path.join(out_folder, "fasta_files")
        if os.path.exists(fasta_folder):
            shutil.rmtree(fasta_folder)
        # Always (re)create the folder: get_target_fasta writes into it
        # even when it did not exist before.
        os.mkdir(fasta_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files,
                         out_name, out_folder, log):
        '''Write "<out_folder>/fasta_files/<out_name>.fa".

        seq_editer.modify_seq produces one updated fasta per strain of the
        mutation table; these are collected and merged. With exactly one
        strain block the header is replaced by ">" + out_name, otherwise
        the files are concatenated unchanged. A mutation table without
        data rows yields an empty merged file. tar_folder is not used
        here.
        '''
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        fasta_folder = os.path.join(out_folder, "fasta_files")
        pre_strain = None
        strain_num = 0
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                if row[0].startswith("#"):
                    continue
                if pre_strain != row[0]:
                    strain_num = strain_num + 1
                    tmp_tar_name = "_".join([out_name, row[0]]) + ".fa"
                    # The output file is created even when no updated
                    # sequence exists, as before.
                    with open(os.path.join(fasta_folder,
                                           tmp_tar_name), "w") as out:
                        if tmp_tar_name in os.listdir(
                                self.folders["tmp_tar"]):
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    tmp_tar_name)) as f_h:
                                shutil.copyfileobj(f_h, out)
                        else:
                            print("Error: No updated information of "
                                  "{0}.fa".format(row[0]))
                pre_strain = row[0]
        out_seq = out_name + ".fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        if strain_num == 1:
            with open(out_seq, "w") as o_s:
                for seq in os.listdir(fasta_folder):
                    if not seq.endswith(".fa"):
                        continue
                    seq_file = os.path.join(fasta_folder, seq)
                    with open(seq_file) as t_h:
                        for line in t_h:
                            if line.startswith(">"):
                                o_s.write(">" + out_name + "\n")
                            else:
                                o_s.write(line)
                    os.remove(seq_file)
        else:
            # Plain concatenation done in Python, so paths with spaces or
            # shell metacharacters are handled correctly.
            seqs = [seq for seq in os.listdir(fasta_folder)
                    if seq.endswith(".fa")]
            with open(out_seq, "ab") as merged:
                for seq in seqs:
                    seq_file = os.path.join(fasta_folder, seq)
                    with open(seq_file, "rb") as src:
                        shutil.copyfileobj(src, merged)
                    os.remove(seq_file)
        shutil.move(out_seq, os.path.join(
            out_folder, "fasta_files", out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(out_folder, "fasta_files", out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")
Esempio n. 23
0
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        '''Create the helper objects and derive all paths from out_folder.

        The result sub-folder depends on args_snp.types; the figure folder
        below the statistics folder is created through
        helper.check_make_folder.
        '''
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        out_root = args_snp.out_folder
        file_type = ("compare_related_and_reference_genomes"
                     if args_snp.types == "related_genome"
                     else "mutations_of_reference_genomes")
        result_root = os.path.join(out_root, file_type)
        self.seq_path = os.path.join(result_root, "seqs")
        self.stat_path = os.path.join(result_root, "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(result_root, "SNP_tables"),
            "raw": os.path.join(result_root, "SNP_raw_outputs"),
            "tmp": os.path.join(out_root, "tmp_bcf"),
            "depth": os.path.join(out_root, "tmp_depth")}
        self.bams = {
            "whole": os.path.join(out_root, "whole_reads.bam"),
            "sort": os.path.join(out_root, "whole_reads_sorted.bam"),
            "bams": []}
        self.header = os.path.join(out_root, "header")
        # Maps the short program key to the name used in output paths.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, out_table_prefix, type_,
                        prefix, bam_datas, table_path, args_snp):
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(self.stat_path, "_".join([
                "stat", "_".join([prefix, self.baqs[type_], bam["sample"]]),
                "SNP"]))
            snp_file = os.path.join(self.outputs["raw"], prefix, "_".join(
                [prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bcf_para

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas, log):
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if type_ == "with":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
            elif type_ == "without":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
            elif type_ == "extend":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
            if args_snp.rg:
                command = command + ["-ugf", fasta_file, bam_file]
            else:
                command = command + ["--ignore-RG", "-ugf", fasta_file, bam_file]
            log.write(" ".join(command) + ">" + self.outputs["tmp"] + "\n")
            os.system(" ".join(command) + ">" + self.outputs["tmp"])
            bam["vcf"] = os.path.join(self.outputs["raw"], "_".join(
                [self.baqs[type_], bam["sample"] + ".vcf"]))
            if args_snp.chrom == "1":
                log.write(" ".join([
                      args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]])
            elif args_snp.chrom == "2":
                log.write(" ".join([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]])
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(self.outputs["raw"]):
            log.write("\t" + os.path.join(self.outputs["raw"], file_) + "\n")

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog, log):
        seq_names = []
        fa_prefixs = []
        log.write("Parsing Vcf files by comparing fasta information.\n")
        for fa in os.listdir(args_snp.fastas):
            if (fa != "all.fa") and (not fa.endswith(".fai")):
                with open(os.path.join(args_snp.fastas, fa)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            seq_names.append(line[1:])
                fa_prefix = ".".join(fa.split(".")[:-1])
                fa_prefixs.append(fa_prefix)
                vcf_folder = os.path.join(
                    self.outputs["raw"], fa_prefix)
                if num_prog == 0:
                    self.helper.check_make_folder(vcf_folder)
                    self.helper.check_make_folder(os.path.join(
                        self.outputs["table"], fa_prefix))
                self.helper.check_make_folder(
                    os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
                for vcf in os.listdir(self.outputs["raw"]):
                    if vcf.endswith(".vcf"):
                        out = open(os.path.join(vcf_folder, "_".join(
                            [fa_prefix, vcf])), "w")
                        with open(os.path.join(self.outputs["raw"],
                                  vcf)) as vh:
                            for line in vh:
                                line = line.strip()
                                if line.startswith("#"):
                                    out.write(line + "\n")
                                else:
                                    if line.split("\t")[0] in seq_names:
                                        out.write(line + "\n")
                        out.close()
                        log.write("\t" + os.path.join(vcf_folder, "_".join(
                            [fa_prefix, vcf])) + " is generated.\n")
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog, log):
        self._run_tools(all_fasta, type_, args_snp, bam_datas, log)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog, log)
        log.write("Running transcript_SNP.py to do statistics, filter SNPs, "
                  "and generate potential sequences.\n")
        log.write("The following files are generated:\n")
        for fa_prefix in fa_prefixs:
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(table_path, "_".join(
                [fa_prefix, self.baqs[type_]]))
            self._transcript_snp(
                fasta_file, table_prefix,
                type_, fa_prefix, bam_datas, table_path, args_snp)
            seq_path = os.path.join(self.seq_path, self.baqs[type_], fa_prefix)
            for folder in (table_path, self.stat_path, seq_path, self.fig_path):
                for file_ in os.listdir(folder):
                    if os.path.isfile(os.path.join(folder, file_)):
                        log.write("\t" + os.path.join(folder, file_) + "\n")

    def _run_program(self, all_fasta, bam_datas, args_snp, log):
        num_prog = 0
        log.write("Running Samtools to mpileup, and using Bcftools to "
                  "call snp.\n")
        log.write("Please make sure the version of Samtools and Bcftools "
                  "are both at least 1.3.1.\n")
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
                log.write("Running SNP calling with BAQ.\n")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
                log.write("Running SNP calling without BAQ.\n")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                log.write("Running SNP calling extend BAQ.\n")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                log.write("No valid program can be found, please assign"
                          "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".\n")
                sys.exit()
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog, log)
            num_prog += 1

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file, log):
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            if type_file == "all":
                command = (" ".join([samtools_path, sub_command,
                                     "-o", bam_file, self.bams["whole"]]))
            else:
                command = (" ".join([samtools_path, sub_command,
                                     "-o",
                                     bam_file, type_file]))
        log.write(command + "\n")
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas, log):
        bams = []
        num_normal = 0
        num_frag = 0
        log.write("Using Samtools to merge and sort BAM files.\n")
        log.write("Please make sure the version of Samtools is at least 1.3.1.\n")
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder, bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(
                    args_snp.samtools_path, "sort",
                    out_bam, bam["bams"][0], log)
                bam["bam_number"] = 1
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all", log)
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(
                    args_snp.samtools_path, "sort",
                    out_bam, "all", log)
                bam["bam_number"] += 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            out_depth = open(self.outputs["depth"] + bam["sample"], "w")
            log.write(" ".join([args_snp.samtools_path, "index",  out_bam]) + "\n")
            call([args_snp.samtools_path, "index",  out_bam])
            log.write(" ".join([args_snp.samtools_path, "depth",  out_bam]) + "\n")
            call([args_snp.samtools_path, "depth",  out_bam],
                 stdout=out_depth)
            out_depth.close()
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        log.write("\t" + self.bams["whole"] + " is temporary generated "
                  "(be deleted afterward).\n")
        for file_ in os.listdir(args_snp.out_folder):
            if os.path.isfile(os.path.join(args_snp.out_folder, file_)):
                log.write("\t" + os.path.join(args_snp.out_folder, file_) + "\n")
        

    def _modify_header(self, fastas):
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                if row[1].split(":")[1] not in seq_names:
                    seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp, bam_datas):
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path,
                             bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if os.path.exists(bam_file):
                os.remove(bam_file)
            if os.path.exists(bam_file + ".bai"):
                os.remove(bam_file + ".bai")
            if os.path.exists(self.header):
                os.remove(self.header)
            os.remove(self.outputs["depth"] + bam["sample"])

    def _extract_bams(self, bams, log):
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                log.write("the format of --bam_files is wrong!\n")
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    log.write(file_ + " is not found.\n")
                    sys.exit()
            bam_datas.append({"sample": datas[0],
                              "rep": len(datas[-1].split(",")),
                              "bams": datas[-1].split(",")})
        return bam_datas

    def _merge_fasta(self, fastas, log):
        all_fasta = os.path.join(fastas, "all.fa")
        names = []
        out = open(all_fasta, "w")
        print_ = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fa")) or (
                    fasta.endswith(".fasta")) or (
                    fasta.endswith(".fna")):
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            if line not in names:
                                print_ = True
                                names.append(line)
                            else:
                                print_ = False
                        if print_:
                            out.write(line + "\n")
                log.write(os.path.join(fastas, fasta) + " is loaded.\n")
        out.close()
        return all_fasta

    def run_snp_calling(self, args_snp, log):
        """Run the whole SNP calling workflow.

        The requested programs are validated first, so a wrong setting stops
        the run before any file is modified or created. Then the FASTA
        headers are edited, the FASTA files are merged, the BAM files are
        merged/sorted, the selected programs are run and the temporary files
        are removed.

        Args:
            args_snp: Argument container; ``program``, ``fastas`` and
                ``bams`` are read here.
            log: Writable log handle.
        """
        valid_programs = ("with_BAQ", "without_BAQ", "extend_BAQ")
        if not any(program in args_snp.program
                   for program in valid_programs):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            log.write("No valid program can be found, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".\n")
            sys.exit()
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas, log)
        bam_datas = self._extract_bams(args_snp.bams, log)
        self._merge_bams(args_snp, bam_datas, log)
        print("Detecting mutations now")
        self._run_program(all_fasta, bam_datas, args_snp, log)
        # These files only exist when the external tools produced them.
        for leftover in (self.outputs["tmp"], all_fasta, all_fasta + ".fai"):
            if os.path.exists(leftover):
                os.remove(leftover)
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
        log.write("Remove all the temporary files.\n")
Esempio n. 24
0
class SNPCalling(object):
    '''Detect SNPs/mutations by running samtools and bcftools on BAM files.'''

    def __init__(self, args_snp):
        """Create the helper objects and derive all output/temporary paths.

        Args:
            args_snp: Argument container; ``types`` and ``out_folder`` are
                read here.
        """
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_tables"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")
        }
        self.bams = {
            "whole": os.path.join(args_snp.out_folder, "whole_reads.bam"),
            "sort": os.path.join(args_snp.out_folder,
                                 "whole_reads_sorted.bam"),
            "bams": []
        }
        self.header = os.path.join(args_snp.out_folder, "header")
        # Maps the short mode name to the label used in file/folder names.
        self.baqs = {
            "with": "with_BAQ",
            "without": "without_BAQ",
            "extend": "extend_BAQ"
        }

    def _transcript_snp(self, fasta, out_table_prefix, type_, prefix,
                        bam_datas, table_path, args_snp):
        """Run ``snp_detect`` for every sample of one FASTA prefix.

        After each sample, ``helper.move_all_content`` is called with
        ``table_path``, ``self.fig_path`` and ``[".png"]`` (presumably moving
        the generated figures; see Helper for the exact semantics).
        """
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(
                self.stat_path, "_".join([
                    "stat", "_".join([prefix, self.baqs[type_],
                                      bam["sample"]]), "SNP"
                ]))
            snp_file = os.path.join(
                self.outputs["raw"], prefix,
                "_".join([prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
            self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        """Return the bcftools option string for the configured caller."""
        return "-vcO" if args_snp.caller == "c" else "-vmO"

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas):
        """Run ``samtools mpileup`` and ``bcftools call`` for every sample.

        The mpileup output is written to ``self.outputs["tmp"]``; the VCF
        path of each sample is stored in ``bam["vcf"]``.

        Raises:
            ValueError: If ``type_`` is not ``with``, ``without`` or
                ``extend``.
        """
        baq_flags = {"with": [], "without": ["-B"], "extend": ["-E"]}
        if type_ not in baq_flags:
            raise ValueError("Unknown BAQ mode: " + repr(type_))
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            command = ([args_snp.samtools_path, "mpileup", "-t", "DP"] +
                       baq_flags[type_])
            if not args_snp.rg:
                command.append("--ignore-RG")
            command += ["-ugf", fasta_file, bam_file]
            # Redirect stdout directly instead of building a shell string,
            # so paths containing spaces or shell characters are safe.
            with open(self.outputs["tmp"], "wb") as pileup:
                call(command, stdout=pileup)
            bam["vcf"] = os.path.join(
                self.outputs["raw"],
                "_".join([self.baqs[type_], bam["sample"] + ".vcf"]))
            # NOTE(review): any other ``chrom`` value runs no caller at all,
            # as before - confirm whether that should be an error.
            if args_snp.chrom == "1":
                call([
                    args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                    self.outputs["tmp"], bcf_para, "v", "-o", bam["vcf"]
                ])
            elif args_snp.chrom == "2":
                call([
                    args_snp.bcftools_path, "call", self.outputs["tmp"],
                    bcf_para, "v", "-o", bam["vcf"]
                ])

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog):
        """Split the raw VCF files into one VCF per FASTA file.

        For every file in ``args_snp.fastas`` (except ``all.fa`` and ``.fai``
        files) a sub-folder named after the file prefix is used, and each raw
        VCF is copied there keeping the header lines and only the records
        whose first column is a sequence of that FASTA file. The unsplit VCF
        files are removed afterwards.

        Returns:
            The list of FASTA file prefixes, in sorted file order.
        """
        fa_prefixs = []
        raw_folder = self.outputs["raw"]
        for fa in sorted(os.listdir(args_snp.fastas)):
            if (fa == "all.fa") or fa.endswith(".fai"):
                continue
            # Sequence names are collected per FASTA file; sharing one list
            # across files leaked records of earlier files into later VCFs.
            seq_names = set()
            with open(os.path.join(args_snp.fastas, fa)) as fh:
                for line in fh:
                    line = line.strip()
                    if line.startswith(">"):
                        seq_names.add(line[1:])
            fa_prefix = ".".join(fa.split(".")[:-1])
            fa_prefixs.append(fa_prefix)
            vcf_folder = os.path.join(raw_folder, fa_prefix)
            if num_prog == 0:
                # Only the first program sets up the shared folders.
                self.helper.check_make_folder(vcf_folder)
                self.helper.check_make_folder(
                    os.path.join(self.outputs["table"], fa_prefix))
            self.helper.check_make_folder(
                os.path.join(self.seq_path, self.baqs[type_], fa_prefix))
            for vcf in os.listdir(raw_folder):
                if not vcf.endswith(".vcf"):
                    continue
                target = os.path.join(vcf_folder, "_".join([fa_prefix, vcf]))
                with open(target, "w") as out:
                    with open(os.path.join(raw_folder, vcf)) as vh:
                        for line in vh:
                            line = line.strip()
                            if line.startswith("#") or (
                                    line.split("\t")[0] in seq_names):
                                out.write(line + "\n")
        for vcf in os.listdir(raw_folder):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(raw_folder, vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog):
        """Run one BAQ mode: call variants, split the VCFs, build tables."""
        self._run_tools(all_fasta, type_, args_snp, bam_datas)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog)
        # Map each prefix to its FASTA file with the same rule that produced
        # the prefix; a substring test could pick a wrong or index file.
        fasta_by_prefix = {}
        for fasta in sorted(os.listdir(args_snp.fastas)):
            if (fasta == "all.fa") or fasta.endswith(".fai"):
                continue
            fasta_by_prefix.setdefault(
                ".".join(fasta.split(".")[:-1]),
                os.path.join(args_snp.fastas, fasta))
        for fa_prefix in fa_prefixs:
            if fa_prefix not in fasta_by_prefix:
                raise FileNotFoundError(
                    "No fasta file found for " + fa_prefix)
            fasta_file = fasta_by_prefix[fa_prefix]
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(
                table_path, "_".join([fa_prefix, self.baqs[type_]]))
            self._transcript_snp(fasta_file, table_prefix, type_, fa_prefix,
                                 bam_datas, table_path, args_snp)

    def _run_program(self, all_fasta, bam_datas, args_snp):
        """Run every program listed in ``args_snp.program`` in order.

        Exits the program when an unknown program name is found.
        """
        modes = {
            "with_BAQ": ("with", "Running SNP calling with BAQ"),
            "without_BAQ": ("without", "Running SNP calling without BAQ"),
            "extend_BAQ": ("extend", "Running SNP calling extend BAQ"),
        }
        for num_prog, index in enumerate(args_snp.program):
            if index not in modes:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            type_, message = modes[index]
            print(message)
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog)

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file):
        """Build and run a ``samtools merge`` or ``samtools sort`` command.

        Args:
            samtools_path: Path of the samtools executable.
            sub_command: Either ``"merge"`` or ``"sort"``.
            bam_file: For ``merge`` the space-separated input BAM files;
                for ``sort`` the output BAM file.
            type_file: ``"all"`` to sort ``self.bams["whole"]``; otherwise
                the input BAM to sort. Not used for ``merge``.

        Raises:
            ValueError: If ``sub_command`` is neither ``merge`` nor ``sort``.
        """
        if sub_command == "merge":
            arguments = [self.bams["whole"], bam_file]
        elif sub_command == "sort":
            source = self.bams["whole"] if type_file == "all" else type_file
            arguments = ["-o", bam_file, source]
        else:
            raise ValueError(
                "Unsupported samtools sub-command: " + repr(sub_command))
        # Run through the shell: for "merge", bam_file holds several paths
        # joined by spaces and relies on the shell to split them.
        os.system(" ".join([samtools_path, sub_command] + arguments))

    def _merge_bams(self, args_snp, bam_datas):
        """Create a sorted, indexed BAM and a depth file for every sample.

        Each entry of ``bam_datas`` gets ``bam_number`` set to 1.
        """
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder, bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam,
                              bam["bams"][0])
                bam["bam_number"] = 1
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all")
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam, "all")
                bam["bam_number"] += 1
            # The merged BAM is only an intermediate of the sort step.
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            with open(self.outputs["depth"] + bam["sample"], "w") as out_depth:
                call([args_snp.samtools_path, "index", out_bam])
                call([args_snp.samtools_path, "depth", out_bam],
                     stdout=out_depth)

    def _modify_header(self, fastas):
        """Pass every FASTA file in ``fastas`` to ``seq_editer.modify_header``.

        The dot is part of the accepted suffixes so that names merely ending
        in ``fa`` are not treated as FASTA files.
        """
        for fasta in os.listdir(fastas):
            if fasta.endswith((".fa", ".fasta", ".fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        """Append the ``@SQ`` sequence names of ``bam`` to ``seq_names``.

        The header is written to ``self.header`` by ``samtools view -H``.
        """
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        with open(self.header, "r") as fh:
            for line in fh:
                fields = line.rstrip("\r\n").split("\t")
                if fields[0] != "@SQ":
                    continue
                for field in fields[1:]:
                    # Keep everything after "SN:"; names may contain colons.
                    if field.startswith("SN:"):
                        name = field[3:]
                        if name not in seq_names:
                            seq_names.append(name)
                        break

    def _get_genome_name(self, args_snp, bam_datas):
        """Return the sequence names found in the BAM file of every sample."""
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path, bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        """Delete the per-sample BAM, index, header and depth files.

        Files that do not exist are skipped.
        """
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            leftovers = (bam_file, bam_file + ".bai", self.header,
                         self.outputs["depth"] + bam["sample"])
            for path in leftovers:
                if os.path.exists(path):
                    os.remove(path)

    def _extract_bams(self, bams):
        """Parse ``sample:file1,file2,...`` strings into sample records.

        Returns:
            A list of dicts with the keys ``sample``, ``rep`` and ``bams``.

        Exits the program when an entry is malformed or a file is missing.
        """
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    sys.exit()
            bam_datas.append({
                "sample": datas[0],
                "rep": len(files),
                "bams": files
            })
        return bam_datas

    def _merge_fasta(self, fastas):
        """Merge all FASTA files of a folder into ``<fastas>/all.fa``.

        Files are read in sorted order; a record whose header line was
        already seen is skipped.

        Returns:
            Path of the merged FASTA file.
        """
        all_fasta = os.path.join(fastas, "all.fa")
        # List the inputs before the output exists and leave out "all.fa"
        # itself; otherwise the file being written is read as an input.
        sources = sorted(
            name for name in os.listdir(fastas)
            if name != "all.fa" and name.endswith((".fa", ".fasta", ".fna")))
        names = set()
        print_ = False
        with open(all_fasta, "w") as out:
            for fasta in sources:
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            print_ = line not in names
                            names.add(line)
                        if print_:
                            out.write(line + "\n")
        return all_fasta

    def run_snp_calling(self, args_snp):
        """Run the whole SNP calling workflow.

        The requested programs are validated first, so a wrong setting stops
        the run before any file is modified or created.
        """
        valid_programs = ("with_BAQ", "without_BAQ", "extend_BAQ")
        if not any(program in args_snp.program
                   for program in valid_programs):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas)
        bam_datas = self._extract_bams(args_snp.bams)
        self._merge_bams(args_snp, bam_datas)
        print("Detecting mutations now")
        self._run_program(all_fasta, bam_datas, args_snp)
        # These files only exist when the external tools produced them.
        for leftover in (self.outputs["tmp"], all_fasta, all_fasta + ".fai"):
            if os.path.exists(leftover):
                os.remove(leftover)
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
Esempio n. 25
0
class TargetFasta(object):
    '''Generate updated FASTA files from references and a mutation table.'''

    def __init__(self, tar_folder, ref_folder):
        """Create the helper objects and the temporary target folder path.

        Args:
            tar_folder: Folder in which the temporary ``tmp`` folder for the
                updated sequences is placed.
            ref_folder: Not used by this class; kept for interface
                compatibility.
        """
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Prepare the temporary reference and target folders.

        The reference files are copied to ``<out_folder>/tmp_reference`` and
        handed to ``multiparser.parser_fasta``; the temporary target folder
        is recreated empty.

        Returns:
            Path of the temporary reference folder.
        """
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def _export_strain(self, strain, fasta_folder):
        """Copy the updated sequence of one strain into ``fasta_folder``."""
        source = os.path.join(self.folders["tmp_tar"], strain + ".fa")
        with open(os.path.join(fasta_folder, strain + ".fa"), "w") as out:
            if os.path.isfile(source):
                with open(source) as f_h:
                    shutil.copyfileobj(f_h, out)
            else:
                print("Error: No fasta information of {0}.fa".format(strain))

    def get_target_fasta(self, mut_table, tar_folder, ref_files, combine,
                         out_folder):
        """Write the updated FASTA files to ``<out_folder>/fasta_files``.

        Args:
            mut_table: Tab-separated mutation table; the strain name is read
                from the second column, lines starting with ``#`` are
                comments.
            tar_folder: Not used here; the target folder given to the
                constructor is used.
            ref_files: Reference FASTA files to update.
            combine: When true, all ``.fa`` files of the output folder are
                concatenated into ``updated_genomes.fa`` and removed.
            out_folder: Output folder.
        """
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Updating the reference sequences")
        fasta_folder = os.path.join(out_folder, "fasta_files")
        os.makedirs(fasta_folder, exist_ok=True)
        pre_strain = None
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                # Skip comments and rows without a strain column.
                if len(row) < 2 or row[0].startswith("#"):
                    continue
                strain = row[1]
                if strain == pre_strain:
                    # Consecutive rows of one strain need a single export.
                    continue
                pre_strain = strain
                self._export_strain(strain, fasta_folder)
        if combine:
            out_seq = "updated_genomes.fa"
            # Snapshot the parts first; the staging name does not end in
            # ".fa", so it can never be picked up as an input.
            parts = sorted(seq for seq in os.listdir(fasta_folder)
                           if seq.endswith(".fa"))
            staging = os.path.join(fasta_folder, out_seq + ".tmp")
            with open(staging, "w") as merged:
                for seq in parts:
                    part_path = os.path.join(fasta_folder, seq)
                    with open(part_path) as part:
                        shutil.copyfileobj(part, merged)
                    os.remove(part_path)
            shutil.move(staging, os.path.join(fasta_folder, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta files to remapping again.")
Esempio n. 26
0
class TargetFasta(object):
    '''Generate an updated FASTA file from references and a mutation table.'''

    def __init__(self, tar_folder, ref_folder):
        """Create the helper objects and the temporary target folder path.

        Args:
            tar_folder: Folder in which the temporary ``tmp`` folder for the
                updated sequences is placed.
            ref_folder: Not used by this class; kept for interface
                compatibility.
        """
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Prepare the temporary reference, target and output folders.

        The reference files are copied to ``<out_folder>/tmp_reference`` and
        handed to ``multiparser.parser_fasta``. ``<out_folder>/fasta_files``
        and the temporary target folder are recreated empty.

        Returns:
            Path of the temporary reference folder.
        """
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        fasta_folder = os.path.join(out_folder, "fasta_files")
        if os.path.exists(fasta_folder):
            shutil.rmtree(fasta_folder)
        # Always create the folder; it used to be created only when it had
        # existed before, which made a first run fail.
        os.mkdir(fasta_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def _export_strain(self, strain, out_name, fasta_folder):
        """Copy the updated sequence of one strain into ``fasta_folder``."""
        tmp_tar_name = "_".join([out_name, strain]) + ".fa"
        source = os.path.join(self.folders["tmp_tar"], tmp_tar_name)
        with open(os.path.join(fasta_folder, tmp_tar_name), "w") as out:
            if os.path.isfile(source):
                with open(source) as f_h:
                    shutil.copyfileobj(f_h, out)
            else:
                print("Error: No updated information of {0}.fa".format(
                    strain))

    def get_target_fasta(self, mut_table, tar_folder, ref_files, out_name,
                         out_folder, log):
        """Write ``<out_folder>/fasta_files/<out_name>.fa``.

        Args:
            mut_table: Tab-separated mutation table; the strain name is read
                from the first column, lines starting with ``#`` are
                comments.
            tar_folder: Not used here; the target folder given to the
                constructor is used.
            ref_files: Reference FASTA files to update.
            out_name: Name of the output sequence/file. When the table holds
                a single strain, every header is replaced by ``>out_name``.
            out_folder: Output folder.
            log: Writable log handle.
        """
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        fasta_folder = os.path.join(out_folder, "fasta_files")
        pre_strain = None
        strain_num = 0
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                # Skip blank lines and comments.
                if not row or row[0].startswith("#"):
                    continue
                if pre_strain != row[0]:
                    strain_num = strain_num + 1
                    self._export_strain(row[0], out_name, fasta_folder)
                pre_strain = row[0]
        out_seq = out_name + ".fa"
        final_seq = os.path.join(fasta_folder, out_seq)
        # Snapshot the parts first; the staging name does not end in ".fa",
        # so it can never be picked up as an input.
        parts = sorted(seq for seq in os.listdir(fasta_folder)
                       if seq.endswith(".fa"))
        staging = final_seq + ".tmp"
        with open(staging, "w") as o_s:
            for seq in parts:
                part_path = os.path.join(fasta_folder, seq)
                with open(part_path) as t_h:
                    if strain_num == 1:
                        # A single strain is renamed to the output name.
                        for line in t_h:
                            if line.startswith(">"):
                                o_s.write(">" + out_name + "\n")
                            else:
                                o_s.write(line)
                    else:
                        shutil.copyfileobj(t_h, o_s)
                os.remove(part_path)
        shutil.move(staging, final_seq)
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(out_folder, "fasta_files", out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")