def convert_sort(self, sort_gene_origin):
    """Sort and index the BAM file produced by the mapping step.

    Parameters
    ----------
    sort_gene_origin: str
        Name of the BAM file created by the mapping algorithm.
    """
    sorted_bam = "SortedBAM_" + sort_gene_origin
    if self.map_type == "Novoalign":
        # novosort sorts, removes duplicates and writes the .bai index
        # in a single pass, so no separate indexing call is needed here.
        command = (
            self.get_paths.novoalign + "novosort -m 16g -t . -c " + self.threads
            + " --removeduplicates --keeptags " + sort_gene_origin
            + " -i -o " + sorted_bam
        )
        log_command(command, "Convert Sort", self.threads, "Mapping")
        self.file_list.append(sorted_bam)
        self.file_list.append(sorted_bam + ".bai")
    else:
        # samtools path: convert, pipe into sort, then index separately.
        command = (
            "samtools view -@" + self.threads + " -bS " + sort_gene_origin
            + " | samtools sort -@" + self.threads + " -o " + sorted_bam
        )
        log_command(command, "Convert Sort", self.threads, "Mapping")
        self.file_list.append(sorted_bam)
        index_file = helpers.create_index(sorted_bam, "Create Index", self.threads, "Mapping")
        self.file_list.append(index_file)
def gatk4_applybsqr(self, lastbam, recaltable):
    """Run GATK4 ApplyBQSR on *lastbam* using *recaltable*, then index the output.

    Parameters
    ----------
    lastbam: str
        Input BAM file name.
    recaltable: str
        Recalibration table produced by BaseRecalibrator.
    """
    recalibrated_bam = "GATK4_" + lastbam
    cmd = (
        self.get_paths.gatk4_path
        + " ApplyBQSR -R " + self.bundle_dir + "Homo_sapiens_assembly38.fasta"
        + " -I " + lastbam
        + " --bqsr-recal-file " + recaltable
        + " -O " + recalibrated_bam
    )
    log_command(cmd, "ApplyBQSR", self.threads, "Gatk4PreProcessing")
    self.file_list.append(recalibrated_bam)
    bai = helpers.create_index(recalibrated_bam, "Create Index by GATK_ApplyBSQR", self.threads, "GatkPreProcess")
    self.file_list.append(bai)
def gatk3_print_reads(self, lastbam, bqsr):
    """Run GATK3 PrintReads with the given BQSR table, then index the result.

    Parameters
    ----------
    lastbam: str
        Input BAM file name.
    bqsr: str
        Base-quality recalibration table file.
    """
    recalibrated_bam = "GATK_PR" + lastbam
    command = (
        "java -jar " + self.get_paths.gatk_path
        + " -nct " + str(self.threads)
        + " -T PrintReads -R " + self.bundle_dir + "/ucsc.hg19.fasta"
        + " -I " + lastbam
        + " --BQSR " + bqsr
        + " -o " + recalibrated_bam
    )
    log_command(command, "Print Reads", self.threads, "GatkPreProcessing")
    self.file_list.append(recalibrated_bam)
    bai = helpers.create_index(recalibrated_bam, "Create Index by GATK_PrintReads", self.threads, "GatkPreProcess")
    self.file_list.append(bai)
def convert_sort(self, sort_gene_origin):
    """Create a coordinate-sorted, indexed BAM from the mapper's output.

    Parameters
    ----------
    sort_gene_origin: str
        Name of the BAM file created by the mapping algorithm.
    """
    sorted_name = f"SortedBAM_{sort_gene_origin}"
    # Convert to BAM and sort in one shell pipeline.
    pipeline = (
        f"samtools view -@{self.threads} -bS {sort_gene_origin}"
        f" | samtools sort -@{self.threads} -o {sorted_name}"
    )
    log_command(pipeline, "Convert Sort", self.threads, "Mapping")
    self.file_list.append(sorted_name)
    index_file = helpers.create_index(sorted_name, "Create Index", self.threads, "Mapping")
    self.file_list.append(index_file)
def pre_process(self, info_dict, all_bam_files):
    """Merge, de-duplicate and index the mapped BAM files.

    Behaviour depends on ``self.split_chr``:

    - ``"After"``: merge all BAMs first, split the merged BAM by
      chromosome, then mark duplicates per chromosome.
    - ``"Before"``: split each BAM by chromosome first, then merge and
      mark duplicates per chromosome group.
    - anything else (the "No" case): merge everything and mark
      duplicates once on the merged BAM.

    Parameters
    ----------
    info_dict: dict
        Run/sample information forwarded to ``merge_bams``.
    all_bam_files: list
        BAM file names produced by the mapping step.

    Returns
    -------
    list or single value
        The "After"/"Before" branches return a list of file names
        containing both "MDUP" and "bam"; the final branch returns the
        single value from ``mark_duplicate`` /
        ``novoalign_sort_markduplicate``.
        NOTE(review): callers must cope with both shapes — confirm.
    """
    if self.split_chr == "After":
        # Merge first, index the merged file, then split per chromosome.
        merged_file = self.merge_bams(info_dict, all_bam_files)
        self.file_list.append(merged_file)
        indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
        self.file_list.append(indexed)
        splitted_files = split_bam_by_chr(merged_file)
        for splitted_file in splitted_files:
            # The chromosome tag is embedded in the file name from
            # "_Chr_" to the end; it is passed through to mark_duplicate.
            index_start = splitted_file.find("_Chr_")
            chr_a = splitted_file[index_start:]
            mark_duplicate_file = self.mark_duplicate(splitted_file, chr_a)
            self.file_list.append(mark_duplicate_file)
            indexed = helpers.create_index(mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
            self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return_files = [a for a in self.file_list if "MDUP" in a and "bam" in a]
        return return_files
    elif self.split_chr == "Before":
        # Split every input BAM per chromosome; the grouped results are
        # collected afterwards via get_bam_by_chr() (the direct return
        # value of split_bam_by_chr is not used in this branch).
        for bam_file in all_bam_files:
            splitted_files = split_bam_by_chr(bam_file)
        all_chr_files = get_bam_by_chr()
        print("preprocess line 128")  # debug output
        print(all_chr_files)
        for i in all_chr_files:
            # Merge the per-chromosome pieces coming from different BAMs,
            # then mark duplicates on the merged per-chromosome file.
            merged_file = self.merge_bams(info_dict, all_chr_files[i])
            self.file_list.append(merged_file)
            indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
            self.file_list.append(indexed)
            # Chromosome tag taken from the first file of the group.
            index_start = all_chr_files[i][0].find("_Chr_")
            chr_a = all_chr_files[i][0][index_start:]
            mark_duplicate_file = self.mark_duplicate(merged_file, chr_a)
            self.file_list.append(mark_duplicate_file)
            indexed = helpers.create_index(mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
            self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return_files = [a for a in self.file_list if "MDUP" in a and "bam" in a]
        return return_files
    else:  # self.split_chr == "No"
        if self.map_type == "Novoalign":
            # Novoalign path: sorting and duplicate marking are handled
            # inside a single helper; nothing extra is indexed here.
            mark_duplicate_file = self.novoalign_sort_markduplicate(info_dict, all_bam_files)
            # self.file_list.append(indexed)
            helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
            return mark_duplicate_file
        merged_file = self.merge_bams(info_dict, all_bam_files)
        indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
        self.file_list.append(merged_file)
        self.file_list.append(indexed)
        # Empty string: no chromosome suffix in the no-split case.
        mark_duplicate_file = self.mark_duplicate(merged_file, "")
        print("preprocess mark duplicate file ")  # debug output
        print(mark_duplicate_file)
        self.file_list.append(mark_duplicate_file)
        indexed = helpers.create_index(mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
        self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return mark_duplicate_file


# Commented-out manual test driver, kept for reference.
# if __name__ == "__main__":
#     pre_processing_step = PreProcessing(working_directory="/home/bioinformaticslab/Desktop/GitHub_Repos/Genomics_Pipeline_Test/test_files",
#                                         map_type="Bwa", sample_type="Tumor", library_matching_id="203", thrds="1", issplitchr="Before")
#
#     mapping_step = mapping.Mapping(working_directory=pre_processing_step.main_directory,
#                                    map_type="Bwa", sample_type="Tumor", library_matching_id="203", thrds="3")
#
#     fastq_list = mapping_step.get_fastq()
#     info_dict = mapping_step.get_info(fastq_list)
#     os.chdir(pre_processing_step.working_directory)
#     bam_files = glob.glob("SortedBAM*.bam")
#     mark_duplicate_file = pre_processing_step.pre_process(info_dict, bam_files)
#     print(mark_duplicate_file)
def pre_process(self, info_dict, all_bam_files):
    """Merge, mark duplicates and index the mapped BAM files.

    The strategy is selected by ``self.split_chr``:

    - ``"After"``: merge all BAMs, then split the merged BAM per
      chromosome and mark duplicates on each piece.
    - ``"Before"``: split each input BAM per chromosome, then merge and
      mark duplicates per chromosome group.
    - anything else (the "No" case): merge everything and run a single
      mark-duplicates pass.

    Parameters
    ----------
    info_dict: dict
        Run/sample information forwarded to ``merge_bams``.
    all_bam_files: list
        BAM file names produced by the mapping step.

    Returns
    -------
    list or single value
        The split branches return the file names containing both "MDUP"
        and "bam"; the last branch returns the single value from
        ``mark_duplicate``. NOTE(review): mixed return shapes — confirm
        callers handle both.
    """
    if self.split_chr == "After":
        # Merge, index, then split the merged BAM per chromosome.
        merged_file = self.merge_bams(info_dict, all_bam_files)
        self.file_list.append(merged_file)
        indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
        self.file_list.append(indexed)
        splitted_files = split_bam_by_chr(merged_file)
        for splitted_file in splitted_files:
            # Chromosome tag runs from "_Chr_" to the end of the name.
            index_start = splitted_file.find("_Chr_")
            chr_a = splitted_file[index_start:]
            mark_duplicate_file = self.mark_duplicate(splitted_file, chr_a)
            self.file_list.append(mark_duplicate_file)
            indexed = helpers.create_index(
                mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
            self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return_files = [
            a for a in self.file_list if "MDUP" in a and "bam" in a
        ]
        return return_files
    elif self.split_chr == "Before":
        # Split each input BAM; grouped per-chromosome files are fetched
        # afterwards via get_bam_by_chr() (the direct return value of
        # split_bam_by_chr is not used in this branch).
        for bam_file in all_bam_files:
            splitted_files = split_bam_by_chr(bam_file)
        all_chr_files = get_bam_by_chr()
        print("preprocess line 128")  # debug output
        print(all_chr_files)
        for i in all_chr_files:
            # Merge the per-chromosome group, then mark duplicates on it.
            merged_file = self.merge_bams(info_dict, all_chr_files[i])
            self.file_list.append(merged_file)
            indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
            self.file_list.append(indexed)
            # Chromosome tag taken from the first file in the group.
            index_start = all_chr_files[i][0].find("_Chr_")
            chr_a = all_chr_files[i][0][index_start:]
            mark_duplicate_file = self.mark_duplicate(merged_file, chr_a)
            self.file_list.append(mark_duplicate_file)
            indexed = helpers.create_index(
                mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
            self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return_files = [
            a for a in self.file_list if "MDUP" in a and "bam" in a
        ]
        return return_files
    else:  # self.split_chr == "No"
        merged_file = self.merge_bams(info_dict, all_bam_files)
        indexed = helpers.create_index(merged_file, "Create Index by Merge", self.threads, "Pre Processing")
        self.file_list.append(merged_file)
        self.file_list.append(indexed)
        # Empty string: no chromosome suffix in the no-split case.
        mark_duplicate_file = self.mark_duplicate(merged_file, "")
        print("preprocess mark duplicate file ")  # debug output
        print(mark_duplicate_file)
        self.file_list.append(mark_duplicate_file)
        indexed = helpers.create_index(mark_duplicate_file, "Create Index by MarkDuplicate", self.threads, "Pre Processing")
        self.file_list.append(indexed)
        helpers.create_folder(self.working_directory, self.file_list, map_type=self.map_type, step="PreProcess", folder_directory=self.folder_directory)
        return mark_duplicate_file