def get_vcfs(input_bam_file, output_vcf_path):
    """
    Run lumpyexpress on one BAM file inside the "bio_c" docker container and
    move the VCF it writes to output_vcf_path.

    Usage: get_vcfs(input_bam_file, output_vcf_path)

    :param: input_bam_file - One BAM file to be parsed. It must exist, be
        readable and end with ".bam". Its directory (with trailing slash) is
        handed to check_container_status and only its file name is given to
        lumpyexpress, which is run from /bio/ inside the container
        (presumably the mounted copy of that directory - confirm against
        check_container_status).
    :param: output_vcf_path - The location where the generated VCF file is to
        be placed. It must contain a '/', end with ".vcf", live in an existing
        directory, be writable, and its file name may only contain ASCII
        letters, digits, '.', '-' and '_'.
    :return: 0 after lumpyexpress has been invoked and its output moved.
    :raises SystemExit: with status 1, after printing an explanation, when
        either path fails validation. Nothing is created on disk in that case.
    """
    # Local import: only needed to quote the file names for the container shell.
    import shlex

    input_ok = (
        isinstance(input_bam_file, str)
        and input_bam_file.endswith(".bam")
        and os.path.isfile(input_bam_file)
        and os.access(input_bam_file, os.R_OK)
    )

    output_ok = False
    vcf_name = ''
    if isinstance(output_vcf_path, str) and output_vcf_path.endswith(".vcf"):
        vcf_dir, slash, vcf_name = output_vcf_path.rpartition('/')
        vcf_dir += slash
        allowed_chars = set(ascii_letters + digits + '-_.')
        # Check writability without creating the file: an existing file must
        # itself be writable, otherwise the directory that will receive it.
        if os.path.exists(output_vcf_path):
            write_target = output_vcf_path
        else:
            write_target = vcf_dir
        # The name cannot be empty here because the path ends with ".vcf".
        output_ok = (
            slash == '/'
            and os.path.isdir(vcf_dir)
            and all(c in allowed_chars for c in vcf_name)
            and os.access(write_target, os.W_OK)
        )

    if not (input_ok and output_ok):
        print(
            "Invalid Input or Output path supplied. The input file itself must exist and be readable and end with "
            ".bam. While the output file must be located in an existing file directory, and its name may not be "
            "empty and may only contain ASCII characters, numbers, ., -, and _. Moreover, the intended path must be "
            "writable and must end with .vcf.")
        # Non-zero status so that calling scripts can detect the failure.
        raise SystemExit(1)

    # rpartition mirrors the original slicing: without a '/', the directory is
    # '' and the whole string is the file name.
    bam_dir, bam_slash, bam_name = input_bam_file.rpartition('/')
    bam_dir += bam_slash

    print("Obtaining the VCF file from %s" % bam_name)
    check_container_status(bam_dir)

    # Pass an argument list (no host shell) and quote the names for the
    # container's sh -c, so unusual characters in the BAM name cannot be
    # interpreted as shell syntax.
    container_command = "cd /bio/ && lumpyexpress -B %s -o %s -P" % (
        shlex.quote(bam_name), shlex.quote(vcf_name))
    subprocess.run(
        ['docker', 'exec', '-ti', 'bio_c', 'sh', '-c', container_command])

    # lumpyexpress wrote the VCF next to the BAM; relocate it to the target.
    shutil.move(bam_dir + vcf_name, output_vcf_path)
    return 0


#
#get_vcfs("/home/nathantaitano/Desktop/pimpiSVs/new_bams/EA00676.bam", "/home/nathantaitano/Desktop/pimpiSVs/new_bams/EA00676.vcf")
#get_vcfs("/home/nathantaitano/Desktop/pimpiSVs/new_bams/BGV008037.bam", "/home/nathantaitano/Desktop/pimpiSVs/new_bams/BGV008037.vcf")
def merge_bam_list(bam_list, output_directory=None, accession=None, v=False, k=False):
    """
    Merge several BAM files into one with "samtools merge" inside the "bio_c"
    docker container.

    Every input that does not already end with "_fixed_header.bam" is first
    passed through fix_header; the resulting files are merged into
    "/bio/<accession>m.bam".

    :param: bam_list - Host paths of the BAM files to merge. The directory of
        the first entry is handed to docker_tools.check_container_status and
        is where the container's /bio/ files are looked up on the host.
        Must not be empty.
    :param: output_directory - If given, the merged BAM is renamed from the
        mount directory into this directory.
    :param: accession - Name used for the merged file and forwarded to
        fix_header. If None, it is derived from the first fixed-header file
        name by dropping its last 17 characters ("_fixed_header.bam").
    :param: v - Verbose mode: print progress messages.
    :param: k - Keep the intermediate fixed-header BAM files.
    :return: None
    """
    mount_directory = os.path.dirname(bam_list[0])
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'

    if v:
        print("Fixing bam headers")
    fixed_header_bam_list = []
    for bam_file in bam_list:
        container_path = "/bio/" + bam_file.split('/')[-1]
        if not bam_file.endswith("_fixed_header.bam"):
            container_path = fix_header(container_path, accession=accession)
        fixed_header_bam_list.append(container_path)

    if v:
        print("Merging bams")
    bam_file_string = ' '.join(fixed_header_bam_list)
    if accession is None:
        # The original referenced an undefined name here (NameError); the
        # fixed-header list is the one whose names carry the suffix.
        # Assumes fix_header returns a path ending in "_fixed_header.bam".
        first_name = fixed_header_bam_list[0].split('/')[-1]
        accession = first_name[:-len("_fixed_header.bam")]
    merged_file = "/bio/" + str(accession) + "m.bam"
    subprocess.call('docker exec -it bio_c sh -c "samtools merge -c %s %s"' \
                    % (merged_file, bam_file_string), shell=True)

    if not k:
        if v:
            print("Deleting fixed header bam files")
        for container_path in fixed_header_bam_list:
            # Strip the leading "/bio/" to get the name on the host side.
            fixed_header_file = mount_directory + container_path[5:]
            # Never delete a file the caller passed in.
            if fixed_header_file not in bam_list:
                os.remove(fixed_header_file)

    if output_directory is not None:
        merged_bam_base_name = merged_file[5:]
        output_location = os.path.join(output_directory, merged_bam_base_name)
        os.rename(mount_directory + merged_bam_base_name, output_location)
def _accession_already_recorded(accession_file, bam_name):
    """Return True if accession_file has a line whose text before ':' is bam_name."""
    with open(accession_file, 'r') as handle:
        return any(line.split(':')[0] == bam_name for line in handle)


def _sample_name_from_header(header_lines):
    """Return the SM: value of the first @RG line, or None if it has none."""
    for line in header_lines:
        if line.startswith("@RG"):
            pieces = line.split("SM:")
            if len(pieces) < 2:
                return None
            # SM may be the last tag on the line; drop the line ending.
            return pieces[1].split('\t')[0].rstrip('\r\n')
    return None


def get_accession(bam_input, accession_file=None, q=False, v=False):
    """
    Read the accession (the SM: tag of the first @RG header line) of a BAM
    file, using "samtools view -H" inside the "bio_c" docker container.

    Unless q is set, the result is also appended to an accession file as
    "<bam file name>:<accession>", and a BAM already listed there is not
    parsed again.

    :param: bam_input - Host path of the BAM file. Its directory is handed to
        docker_tools.check_container_status and receives a temporary header
        file that is removed again before returning.
    :param: accession_file - Text file recording parsed accessions. If None or
        not ending in ".txt", "accession_file.txt" in the BAM's directory is
        used. It is created when it does not exist yet.
    :param: q - Do not read or write the accession file.
    :param: v - Verbose mode: print progress messages.
    :return: The accession string on success, 1 if the BAM is already listed
        in the accession file, 2 if no accession could be found in the header.
    """
    mount_directory = os.path.dirname(bam_input)
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'
    bam_input_name = bam_input.split('/')[-1]

    if not q:
        if accession_file is None or not accession_file.endswith(".txt"):
            accession_file = mount_directory + "accession_file.txt"
        # A missing file simply means nothing has been parsed yet.
        if os.path.exists(accession_file):
            if v:
                print("Checking if %s has already been parsed" % bam_input)
            if _accession_already_recorded(accession_file, bam_input_name):
                print("\n%s has already been parsed. Its accession is in %s\n" %
                      (bam_input, accession_file))
                return 1

    #Prepare temp file info
    base_name = bam_input_name[0:-4]
    random_suffix = str(random.randint(100000, 999999))
    temp_file_base_name = "temp" + base_name + '-' + random_suffix
    temp_file = mount_directory + temp_file_base_name
    if v:
        print("Creating temporary file at %s" % temp_file)
    # Mode 'x' fails instead of overwriting if the name is already taken.
    open(temp_file, 'x').close()

    try:
        #Write input bam's header to temp file
        subprocess.call("docker exec -it bio_c sh -c 'samtools view -H %s > %s'" %
                        ("/bio/" + bam_input_name, "/bio/" + temp_file_base_name),
                        shell=True)
        if v:
            print("Parsing bam header for accession information")
        with open(temp_file, 'r') as header:
            accession = _sample_name_from_header(header)
    finally:
        # Clean up on the failure path as well.
        os.remove(temp_file)

    if accession is None:
        print(
            "\nAccession could not be found. BAM file header may be corrupt?\n"
        )
        return 2

    if not q:
        if v:
            print("Opening accession file at %s" % accession_file)
        with open(accession_file, "a") as record:
            record.write(bam_input_name + ':' + accession + '\n')
    print("\nThe accession of %s is %s" % (bam_input, accession))
    if not q:
        print("Accession written to %s\n" % accession_file)
    return accession
def _read_fai_lengths(fai_path):
    """Return {sequence name: length} from the first two columns of a .fai file."""
    lengths = {}
    with open(fai_path, 'r') as fai:
        for line in fai:
            if not line.strip():
                continue
            cols = line.split('\t')
            lengths[cols[0]] = int(cols[1])
    return lengths


def _sv_end_position(pos_field, info_field):
    """Return the INFO END= value as int, or POS when the record has no END."""
    for entry in info_field.split(';'):
        # Match the exact key so that e.g. CIEND= is not mistaken for END=.
        if entry.startswith("END="):
            return int(entry[len("END="):])
    return int(pos_field)


def _flank_interval(center, flank_rad, chr_length):
    """Return (center - flank_rad, center + flank_rad) clamped to [0, chr_length]."""
    return max(center - flank_rad, 0), min(center + flank_rad, chr_length)


def _write_flank_bed(input_vcf, bed_file, index_dict, flank_rad):
    """Write two BED intervals (start flank, end flank) per VCF record, named by SV ID."""
    with open(input_vcf, 'r') as vcf_reader, open(bed_file, 'w') as bed_writer:
        for line in vcf_reader:
            if line.startswith('#') or not line.strip():
                continue
            cols = line.split('\t')
            chr_name = cols[0]
            start_pos = int(cols[1]) - 1
            sv_id = cols[2]
            end_pos = _sv_end_position(cols[1], cols[7])
            chr_length = index_dict[chr_name]
            for center in (start_pos, end_pos):
                low, high = _flank_interval(center, flank_rad, chr_length)
                bed_writer.write("%s\t%d\t%d\t%s\n" % (chr_name, low, high, sv_id))


def _clean_ids_from_nuc(nuc_file, N_threshold):
    """Return the set of IDs none of whose intervals has N count >= N_threshold."""
    seen_ids = set()
    bad_ids = set()
    with open(nuc_file, 'r') as nuc_reader:
        nuc_reader.readline()  #Skip header
        for line in nuc_reader:
            if not line.strip():
                continue
            cols = line.split('\t')
            sv_id = cols[3]
            seen_ids.add(sv_id)
            # Column 10 is read as the N count of the interval (4 BED columns
            # followed by the columns bedtools nuc appends).
            if int(cols[10]) >= N_threshold:
                bad_ids.add(sv_id)
    return seen_ids - bad_ids


def unknown_N_filter(input_vcf, output_vcf, reference_genome, v=False, N_threshold=10, flank_rad=50):
    """
    Filters a vcf based on the quality of the reference genome.

    An SV is filtered out if at least N_threshold unknown nucleotide bases (N)
    lie within flank_rad base pairs of either of the SV's endpoints. For
    example, with a flank radius of 50 and an N threshold of 10, an SV with 15
    unknown nucleotides around its start position in the reference genome is
    filtered out. The nucleotide content is obtained with the bedtools nuc
    command, run inside the "bio_c" docker container.

    Command line usage noted by the original author:
        python unknown_N_filter.py -i /vcf/to/be/filtered.vcf -f /reference/fasta/file.fa

    Parameters (original command line flag in parentheses):
        input_vcf (-i): vcf to be filtered.
        output_vcf (-o): path of the filtered vcf to write.
        reference_genome (-f): reference genome in fasta format.
            IMPORTANT: there must be an index file (.fai) for the reference
            genome in the same directory, named reference_genome_name.fa.fai.
            Temporary .bed and .nuc files are created in this directory and
            removed again.
        v (-v): verbose mode.
        N_threshold (-n): amount of N at which an SV is filtered. Default: 10
        flank_rad (-r): number of base pairs to inspect on each side of an
            endpoint. Default: 50

    Returns 0 after the filtered vcf has been written.
    Raises Exception if the reference genome index cannot be opened.
    """
    mount_directory = os.path.dirname(reference_genome)
    docker_tools.check_container_status(mount_directory)
    mount_directory += '/'

    reference_genome_index_file = reference_genome + ".fai"
    if v:
        print("Parsing reference genome index at %s" % reference_genome_index_file)
    try:
        index_dict = _read_fai_lengths(reference_genome_index_file)
    except OSError as err:
        raise Exception(
            "Cannot find index file (.fai) for reference genome in same directory as "
            "reference genome. Try running samtools faidx on the reference genome"
        ) from err

    vcf_name = input_vcf.split('/')[-1][:-4]
    random_suffix = str(random.randint(100000, 999999))
    bed_name = "temp" + vcf_name + '-' + random_suffix + ".bed"
    bed_file = mount_directory + bed_name
    fasta_name = reference_genome.split('/')[-1]
    nuc_name = "temp" + vcf_name + '-' + random_suffix + ".nuc"
    nuc_file = mount_directory + nuc_name

    try:
        if v:
            print("Opening %s to create end point flanks" % input_vcf)
            print("Creating temporary bed file at %s" % bed_file)
        _write_flank_bed(input_vcf, bed_file, index_dict, flank_rad)
        if v:
            print("%s converted to BED format at %s" % (input_vcf, bed_file))
            print("Generating nucleotide content at %s" % (nuc_file))
        subprocess.call(
            'docker exec -it bio_c sh -c "bedtools nuc -fi %s -bed %s > %s"' %
            ("/bio/" + fasta_name, "/bio/" + bed_name, "/bio/" + nuc_name),
            shell=True)
    finally:
        # Do not leave the temporary BED behind if a step above failed.
        if os.path.exists(bed_file):
            os.remove(bed_file)

    try:
        if v:
            print("Parsing nucleotide content")
        clean_ids = _clean_ids_from_nuc(nuc_file, N_threshold)
    finally:
        if os.path.exists(nuc_file):
            os.remove(nuc_file)

    if v:
        print("Finished parsing nuc content")
        print("Filtering vcf")
        print("Creating filtered vcf file at %s" % output_vcf)
    with open(input_vcf, 'r') as vcf_reader, open(output_vcf, 'w') as vcf_writer:
        for line in vcf_reader:
            if line.startswith('#'):
                vcf_writer.write(line)
            elif line.strip() and line.split('\t')[2] in clean_ids:
                vcf_writer.write(line)

    print("All SVs processed. Filtered vcf at %s" % output_vcf)
    return 0
subprocess.call( "docker exec -i bio_c sh -c 'cd /bio/ && /speedseq/bin/speedseq align -M 20 -R \"@RG\tID:%s\tSM:%s\tLB:%s\" -o %s %s %s_1.fastq.gz %s_2.fastq.gz'" % (accession, accession, accession, accession, "/bio/BGV1.0_genome.fasta", accession, accession), shell=True) # Lumpy express subprocess.call( "docker exec -i bio_c sh -c 'cd /bio/ && lumpyexpress -B %s -S %s -D %s'" % (accession + ".bam", accession + ".splitters.bam", accession + ".discordants.bam"), shell=True) return # Enter your own mount_directory here. This is where the bams will go mount_directory = "/home/nathantaitano/Desktop/pimpiSVs/new_bams" docker_tools.check_container_status(mount_directory) # Installs speedseq in the docker container subprocess.call( "docker exec -i bio_c sh -c 'git clone --recursive https://github.com/hall-lab/speedseq && cd speedseq && make align'", shell=True) for SRRname, accession in sraAccessionDict.items(): sraProcessFile(SRRname, accession) for EBIname, acession in ebiAccessionDict.items(): ebiProcessFile(EBIname, acession)