import os
import shutil
import sys
from collections import defaultdict

import vcf  # PyVCF

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Package-internal imports; the module paths below assume the snppipeline
# package layout.
from snppipeline import pileup
from snppipeline import utils
from snppipeline import vcf_writer
from snppipeline.__init__ import __version__

# The older functions below call these logging helpers unqualified; the newer
# functions call the utils.* equivalents directly.  Importing them here is an
# assumption made so the older functions remain runnable.
from snppipeline.utils import print_arguments, print_log_header, verbose_print


def write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader):
    """The dense snps are not filtered from outgroup samples.  Instead, we
    copy the original vcf file to _preserved.vcf, and create an empty
    _removed.vcf.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.
    vcf_reader : PyVcf vcf.Reader
        Previously opened VCF reader object.
    """
    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
    try:
        vcf_writer_removed = None
        vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
    except Exception:
        # close vcf_writer_removed and remove the file removed_vcf_file_path
        if vcf_writer_removed is not None:
            vcf_writer_removed.close()
        if os.path.exists(removed_vcf_file_path):
            os.remove(removed_vcf_file_path)
        utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
        return
    vcf_writer_removed.close()
    shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
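# The "_preserved.vcf" / "_removed.vcf" naming convention above assumes the
# input path ends with ".vcf" -- the [:-4] slice drops exactly that suffix.
# A minimal sketch of the derivation, using a hypothetical path:
#
#     >>> vcf_file_path = "samples/sample1/var.flt.vcf"
#     >>> vcf_file_path[:-4] + "_preserved.vcf"
#     'samples/sample1/var.flt_preserved.vcf'
#     >>> vcf_file_path[:-4] + "_removed.vcf"
#     'samples/sample1/var.flt_removed.vcf'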
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list
            of SNP positions

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = dict()
        excluded_sample_directories = set()
        for sample_dir, vcf_file_path in zip(sorted_list_of_sample_directories, list_of_vcf_files):

            if not os.path.isfile(vcf_file_path):
                continue
            if os.path.getsize(vcf_file_path) == 0:
                continue

            verbose_print("Processing VCF file %s" % vcf_file_path)
            sample_name = os.path.basename(os.path.dirname(vcf_file_path))
            snp_set = utils.convert_vcf_file_to_snp_set(vcf_file_path)
            max_snps = options_dict['maxSnps']
            if max_snps >= 0 and len(snp_set) > max_snps:
                verbose_print("Excluding sample %s having %d snps." % (sample_name, len(snp_set)))
                excluded_sample_directories.add(sample_dir)
                continue

            for key in snp_set:
                if key not in snp_dict:
                    sample_list = [sample_name]
                    snp_dict[key] = sample_list
                else:
                    sample_list = snp_dict[key]
                    sample_list.append(sample_name)

        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")

        #======================================================================
        # Write the filtered list of sample directories
        #======================================================================
        sample_directories_list_path = sample_directories_list_path + ".filtered"
        with open(sample_directories_list_path, "w") as filtered_samples_file_object:
            # Loop over the unsorted list to keep the order of samples the same
            # as the original.  This will keep the same HPC log file suffix number.
            for sample_dir in unsorted_list_of_sample_directories:
                if sample_dir not in excluded_sample_directories:
                    filtered_samples_file_object.write("%s\n" % sample_dir)
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
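# A minimal, self-contained sketch of the position-merging step above: each
# sample contributes a set of (contig, position) keys, and the snplist maps
# each key to the list of samples having a variant there.  Toy data only --
# in the pipeline the keys come from utils.convert_vcf_file_to_snp_set.
def _example_merge_snp_positions():
    sample_snp_sets = {
        "sample1": {("contig1", 100), ("contig1", 250)},
        "sample2": {("contig1", 250), ("contig2", 77)},
    }
    snp_dict = {}
    for sample_name, snp_set in sorted(sample_snp_sets.items()):
        for key in snp_set:
            snp_dict.setdefault(key, []).append(sample_name)
    # Result: {('contig1', 100): ['sample1'],
    #          ('contig1', 250): ['sample1', 'sample2'],
    #          ('contig2', 77): ['sample2']}
    return snp_dict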
def create_snp_matrix(options_dict):
    """Create SNP matrix

    Description:
    Create the SNP matrix containing the consensus base for each of the samples
    at the positions where SNPs were found in any of the samples.  The matrix
    contains one row per sample and one column per SNP position.  Non-SNP
    positions are not included in the matrix.  This function expects, or
    creates '(*)', the following files arranged in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/consensus.fasta
                ...
            snpma.fasta (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The consensus.fasta input files are the previously called consensus
           for each sample, used to construct the SNP matrix fasta file.
        3. The snpma.fasta output file contains the SNP calls for each
           sequence, arranged as a multi-fasta file with one sequence per
           sample.

    The sampleDirectories.txt and consensus.fasta files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Args:
        sampleDirsFile : str
            File path (not just file name) of file containing paths
            to directories containing consensus.fasta file for each sequence.
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        consFileName : str
            File name of the previously called consensus fasta files which must
            exist in each of the sample directories
        snpmaFile : str
            File path (not just file name) of the output snp matrix, formatted
            as a fasta file, with each sequence (all of identical length)
            corresponding to the SNPs in the correspondingly named sequence.

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'consFileName':'consensus.fasta',
                    'snpmaFile':'snpma.fasta',
                    'minConsFreq':0.6,
                   }
    create_snp_matrix(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_filename = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_filename, "r") as sample_directories_list_file:
        list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    list_of_sample_directories = sorted([d for d in list_of_sample_directories if d])

    #==========================================================================
    # Verify input consensus.fasta files exist
    #==========================================================================
    consensus_files = []
    bad_file_count = 0
    for sample_directory in list_of_sample_directories:
        consensus_file_path = os.path.join(sample_directory, options_dict['consFileName'])
        bad_count = utils.verify_non_empty_input_files("Consensus fasta file", [consensus_file_path])
        if bad_count == 1:
            bad_file_count += 1
        else:
            consensus_files.append(consensus_file_path)  # keep the list of good files

    if bad_file_count == len(list_of_sample_directories):
        utils.global_error("Error: all %d consensus fasta files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d consensus fasta files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Check if the result is already fresh
    #==========================================================================
    snpma_file_path = options_dict['snpmaFile']
    source_files = consensus_files
    if not options_dict['forceFlag']:
        if not utils.target_needs_rebuild(source_files, snpma_file_path):
            verbose_print("SNP matrix %s has already been freshly built.  Use the -f option to force a rebuild." % snpma_file_path)
            verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
            return

    #==========================================================================
    # Create snp matrix. Write results to file.
    #==========================================================================
    with open(snpma_file_path, "w") as output_file:
        for consensus_file_path in consensus_files:
            verbose_print("Merging " + consensus_file_path)
            with open(consensus_file_path, "r") as input_file:
                for line in input_file:
                    output_file.write(line)

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
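# The freshness checks throughout this module rely on utils.target_needs_rebuild.
# A plausible make-style implementation -- an assumption for illustration, the
# real helper lives in snppipeline.utils -- rebuilds when the target is missing
# or older than any source file:
def _example_target_needs_rebuild(source_files, target_file):
    if not os.path.isfile(target_file):
        return True
    target_mtime = os.path.getmtime(target_file)
    # Rebuild if any existing source file is newer than the target.
    return any(os.path.isfile(f) and os.path.getmtime(f) > target_mtime
               for f in source_files)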
def call_consensus(options_dict):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileup at all positions used
           to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt and reads.all.pileup files are created outside of this
    function.  The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Args:
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.

    Raises:

    Examples:
    options_dict = {'snpListFile':'snplist.txt',
                    'allPileupFile':'reads.all.pileup',
                    'consensusFile':'consensus.fasta',
                    'minBaseQual':15,
                    'minConsFreq':0.6,
                    'minConsStrdDpth':4,
                    'minConsStrdBias':0.10,
                    'vcfFailedSnpGt':'.'
                   }
    call_consensus(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    snp_list_file_path = options_dict['snpListFile']
    all_pileup_file_path = options_dict['allPileupFile']
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = options_dict['consensusFile']
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = options_dict['vcfFileName']
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    # Check if the result is already fresh
    source_files = [snp_list_file_path, all_pileup_file_path]
    if not options_dict['forceFlag'] and not utils.target_needs_rebuild(source_files, consensus_file_path):
        verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
        return

    # Load the list of positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    verbose_print("snp position list length = %d" % snplist_length)

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(options_dict['minConsFreq'],
                                    options_dict['minConsStrdDpth'],
                                    options_dict['minConsStrdBias'])

    snp_positions = set(snp_list)
    parse_positions = None if options_dict['vcfAllPos'] else snp_positions
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  options_dict['minBaseQual'],
                                  parse_positions)
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, options_dict['vcfPreserveRefCase'])
        filters = caller.get_filter_descriptions()
        writer.write_header(sample_name, filters, options_dict['vcfRefName'])
    for pileup_record in pileup_reader:
        chrom = pileup_record.chrom
        pos = pileup_record.position
        consensus_base, fail_reasons = caller.call_consensus(pileup_record)
        if (chrom, pos) in snp_positions:
            if fail_reasons:
                position_consensus_base_dict[(chrom, pos)] = '-'
            else:
                position_consensus_base_dict[(chrom, pos)] = consensus_base
        if vcf_file_name:
            writer.write_from_pileup(pileup_record, fail_reasons, options_dict['vcfFailedSnpGt'])
    if vcf_file_name:
        writer.close()

    verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
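# A toy illustration of the three consensus filters documented above
# (minConsFreq, minConsStrdDpth, minConsStrdBias).  This is not the pipeline's
# pileup.ConsensusCaller -- just the arithmetic the docstring describes,
# applied to hypothetical forward/reverse read counts for the consensus base.
# The filter codes returned here are invented for the example.
def _example_consensus_filters(cons_fwd, cons_rev, total_good_reads,
                               min_cons_freq=0.6, min_cons_strd_dpth=4,
                               min_cons_strd_bias=0.10):
    cons_depth = cons_fwd + cons_rev
    fail_reasons = []
    if total_good_reads == 0 or cons_depth / float(total_good_reads) < min_cons_freq:
        fail_reasons.append("FREQ")      # too few reads support the consensus
    if min(cons_fwd, cons_rev) < min_cons_strd_dpth:
        fail_reasons.append("STRDDPTH")  # weak support on one strand
    if cons_depth > 0 and min(cons_fwd, cons_rev) / float(cons_depth) < min_cons_strd_bias:
        fail_reasons.append("STRDBIAS")  # support heavily skewed to one strand
    return fail_reasons

# _example_consensus_filters(10, 1, 12) returns ['STRDDPTH', 'STRDBIAS']:
# the consensus frequency (11/12) passes, but the reverse strand has too few
# supporting reads and holds less than 10% of the consensus support.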
def call_consensus(args):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileup at all positions used
           to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt and reads.all.pileup files are created outside of this
    function.  The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Parameters
    ----------
    args : namespace
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        excludeFile : str
            File path of VCF file of positions to exclude from the snp matrix.
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.snpListFile = 'snplist.txt'
    args.allPileupFile = 'reads.all.pileup'
    args.consensusFile = 'consensus.fasta'
    args.minBaseQual = 15
    args.minConsFreq = 0.6
    args.minConsStrdDpth = 4
    args.minConsStrdBias = 0.10
    args.vcfFailedSnpGt = '.'
    call_consensus(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    snp_list_file_path = args.snpListFile
    all_pileup_file_path = args.allPileupFile
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = args.consensusFile
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = args.vcfFileName
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    source_files = [snp_list_file_path, all_pileup_file_path]

    exclude_file_path = args.excludeFile
    if exclude_file_path:
        bad_file_count = utils.verify_existing_input_files("Exclude file", [exclude_file_path])
        if bad_file_count > 0:
            utils.sample_error("Error: cannot call consensus without the file of excluded positions.", continue_possible=False)
        excluded_positions = utils.convert_vcf_file_to_snp_set(exclude_file_path)
        source_files.append(exclude_file_path)
    else:
        excluded_positions = set()

    # Check if the result is already fresh
    if not args.forceFlag and not utils.target_needs_rebuild(source_files, consensus_file_path):
        utils.verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        return

    # Load the list of positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    utils.verbose_print("snp position list length = %d" % snplist_length)
    utils.verbose_print("excluded snps list length = %d" % len(excluded_positions))
    utils.verbose_print("total snp position list length = %d" % (snplist_length + len(excluded_positions)))

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(args.minConsFreq,
                                    args.minConsStrdDpth,
                                    args.minConsStrdBias)

    snp_positions = set(snp_list)
    if args.vcfAllPos:
        parse_positions = None
    else:
        parse_positions = snp_positions.union(excluded_positions)
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  args.minBaseQual,
                                  parse_positions)
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, args.vcfPreserveRefCase)
        filters = caller.get_filter_descriptions()
        # TODO: it would be better if the exclude file contained filter headers
        # we could read and re-use here instead of hard-coding this
        filters.append(("Region", "Position is in dense region of snps or near the end of the contig."))
        writer.write_header(sample_name, filters, args.vcfRefName)
    for pileup_record in pileup_reader:
        chrom = pileup_record.chrom
        pos = pileup_record.position
        consensus_base, fail_reasons = caller.call_consensus(pileup_record)
        if (chrom, pos) in excluded_positions:
            # TODO: it would be better if the exclude file contained filter
            # reasons we could re-use here instead of hard-coding this
            fail_reasons = fail_reasons or []
            fail_reasons.append("Region")
        if (chrom, pos) in snp_positions:
            if fail_reasons:
                position_consensus_base_dict[(chrom, pos)] = '-'
            else:
                position_consensus_base_dict[(chrom, pos)] = consensus_base
        if vcf_file_name:
            writer.write_from_pileup(pileup_record, fail_reasons, args.vcfFailedSnpGt)
    if vcf_file_name:
        writer.close()

    utils.verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")
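# The consensus fasta above is assembled by looking up each snplist position
# and gap-filling with '-' for positions that were excluded or failed filters.
# A tiny worked example with toy data:
#
#     >>> snp_list = [("contig1", 100), ("contig1", 250), ("contig2", 77)]
#     >>> calls = {("contig1", 100): "A", ("contig2", 77): "T"}
#     >>> ''.join(calls.get(key, '-') for key in snp_list)
#     'A-T'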
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions, then removes SNPs in these regions from
    the original vcf files of all samples.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf files).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files
           contain the removed SNPs and preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    sampleDirsFile: File path (not just file name) of file containing paths
        to directories containing var.flt.vcf file for each sequence.
    vcfFileName: File name of the VCF files which must exist in each of the
        sample directories
    refFastaFile: File path (not just file name) of reference fasta file
    edgeLength: the length of the edge of a contig in which SNPs will be
        removed. Default is 500.
    windowSize: the size of the window in which max number of SNPs are
        allowed. Default is 1000.
    maxSNP: the maximum number of SNPs allowed in a window of a size defined
        in windowSize. Default is 3.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate some parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    input_file_list = list()

    out_group_list_path = args.outGroupFile
    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except Exception:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(args.refFastaFile, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
        input_file_list.append(args.refFastaFile)
    except Exception:
        utils.global_error("Error: cannot open the reference fasta file, or fail to read the contigs in the reference fasta file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild for all samples because
    # the bad regions are combined across all samples. However, a missing
    # output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_file_list, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions.
    #==========================================================================
    bad_regions_dict = dict()  # Key is the contig ID, and the value is a list of bad regions.
    for vcf_file_path in list_of_vcf_files:
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
            continue

        # Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID in sorted_list_of_outgroup_samples:
            if not need_rebuild_dict[vcf_file_path]:
                vcf_reader_handle.close()
                continue
            # Copy original vcf file to _preserved.vcf, and create an empty _removed.vcf
            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except Exception:
                # close vcf_writer_removed and remove the file removed_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                if os.path.exists(removed_vcf_file_path):
                    os.remove(removed_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
                continue
            vcf_writer_removed.close()
            vcf_reader_handle.close()
            shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
        else:
            # SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]),
            # where SNP_Record is a line in VCF.
            snp_dict = defaultdict(list)
            for vcf_data_line in vcf_reader:
                # Store all SNPs in this sample.  The CHROM should be a contig
                # name in the format of Velvet/SPAdes output.
                record = (vcf_data_line.POS, vcf_data_line)
                snp_dict[vcf_data_line.CHROM].append(record)

            # Find bad regions and add them into bad_regions_dict
            for contig, snp_list in snp_dict.items():
                # sort all SNPs in this contig by position
                sorted_list = sorted(snp_list, key=lambda SNPs: SNPs[0])
                # total number of SNPs
                num_of_snp = len(sorted_list)

                if contig not in bad_regions_dict:
                    # New contig
                    try:
                        contig_length = contig_length_dict[contig]
                    except Exception:
                        # cannot find contig length. Use sys.maxsize.
                        contig_length = sys.maxsize

                    if (contig_length <= (edge_length * 2)):
                        bad_regions_dict[contig] = [(0, contig_length)]
                    else:
                        region = [(0, edge_length), (contig_length - edge_length, contig_length)]
                        bad_regions_dict[contig] = region

                # Process SNPs
                for idx, snp in enumerate(sorted_list):
                    if (idx + max_num_snp) < num_of_snp:
                        pos_start = snp[0]
                        pos_end = sorted_list[idx + max_num_snp][0]
                        if (pos_start + window_size) >= pos_end:
                            # Add bad region
                            regions = bad_regions_dict[contig]
                            temp_region = (pos_start, pos_end)
                            regions.append(temp_region)
            vcf_reader_handle.close()

    # Combine all bad regions for each contig
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        combined_regions = utils.consensus(sorted_regions)
        bad_regions_dict[contig] = combined_regions

    # Scan vcf files to remove SNPs
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue
        # Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]
        if sample_ID not in sorted_list_of_outgroup_samples:
            try:
                vcf_reader_handle = open(vcf_file_path, 'r')
                vcf_reader = vcf.Reader(vcf_reader_handle)
            except Exception:
                utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
                continue

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
            try:
                vcf_writer_preserved = None
                vcf_writer_preserved = vcf.Writer(open(preserved_vcf_file_path, 'w'), vcf_reader)
            except Exception:
                if vcf_writer_preserved is not None:
                    vcf_writer_preserved.close()
                if os.path.exists(preserved_vcf_file_path):
                    os.remove(preserved_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path, continue_possible=True)
                continue
            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except Exception:
                # close vcf_writer_removed and remove the file removed_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                if os.path.exists(removed_vcf_file_path):
                    os.remove(removed_vcf_file_path)
                vcf_writer_preserved.close()
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
                continue

            for vcf_data_line in vcf_reader:
                # The CHROM should be a contig name in the format of Velvet/SPAdes output.
                contig = vcf_data_line.CHROM
                if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                    # Remove this SNP
                    vcf_writer_removed.write_record(vcf_data_line)
                else:
                    # Preserve this SNP
                    vcf_writer_preserved.write_record(vcf_data_line)

            vcf_writer_preserved.close()
            vcf_writer_removed.close()
            vcf_reader_handle.close()
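# A compact sketch of the dense-region test used above: positions i and
# i + max_num_snp are compared, so any window of length window_size holding
# more than max_num_snp snps is flagged.  Toy positions only -- the pipeline
# walks (position, record) tuples from the VCF instead of bare ints.
def _example_find_dense_regions(sorted_positions, max_num_snp=3, window_size=1000):
    regions = []
    for idx, pos_start in enumerate(sorted_positions):
        if idx + max_num_snp < len(sorted_positions):
            pos_end = sorted_positions[idx + max_num_snp]
            if pos_start + window_size >= pos_end:
                # max_num_snp + 1 snps fit inside one window -- too dense
                regions.append((pos_start, pos_end))
    return regions

# _example_find_dense_regions([100, 200, 300, 400, 5000]) returns [(100, 400)]
# because four snps fall within a single 1000-base window, while the snp at
# 5000 is isolated.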
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions, then removes SNPs in these regions from
    the original vcf files of all samples.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf files).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files
           contain the removed SNPs and preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    sampleDirsFile: File path (not just file name) of file containing paths
        to directories containing var.flt.vcf file for each sequence.
    vcfFileName: File name of the VCF files which must exist in each of the
        sample directories
    refFastaFile: File path (not just file name) of reference fasta file
    edgeLength: the length of the edge of a contig in which SNPs will be
        removed. Default is 500.
    windowSize: the size of the window in which max number of SNPs are
        allowed. Default is 1000.
    maxSNP: the maximum number of SNPs allowed in a window of a size defined
        in windowSize. Default is 3.
    acrossSamples: Dense regions found in any sample are filtered from all
        samples.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.acrossSamples

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except Exception:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(ref_fasta_path, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
    except Exception:
        utils.global_error("Error: cannot open the reference fasta file, or fail to read the contigs in the reference fasta file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Filter regions
    #==========================================================================
    if filter_across_samples:
        filter_regions_across_samples(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
    else:
        filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
def write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict):
    """Given a VCF file and a collection of abnormal regions, scan the snps
    in the VCF file and write each snp to either the preserved or removed
    output VCF file.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.
    bad_regions_dict : dict
        Key is the contig ID, and the value is a list of bad region tuples
        (start_position, end_position).
    """
    try:
        vcf_reader_handle = open(vcf_file_path, 'r')
        vcf_reader = vcf.Reader(vcf_reader_handle)
    except Exception:
        utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
        return

    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    try:
        vcf_writer_preserved = None
        vcf_writer_preserved = vcf.Writer(open(preserved_vcf_file_path, 'w'), vcf_reader)
    except Exception:
        if vcf_writer_preserved is not None:
            vcf_writer_preserved.close()
        if os.path.exists(preserved_vcf_file_path):
            os.remove(preserved_vcf_file_path)
        vcf_reader_handle.close()
        utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path, continue_possible=True)
        return

    try:
        vcf_writer_removed = None
        vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
    except Exception:
        # close vcf_writer_removed and remove the file removed_vcf_file_path
        if vcf_writer_removed is not None:
            vcf_writer_removed.close()
        if os.path.exists(removed_vcf_file_path):
            os.remove(removed_vcf_file_path)
        vcf_writer_preserved.close()
        vcf_reader_handle.close()
        utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
        return

    for vcf_data_line in vcf_reader:
        # The CHROM should be a contig name in the format of Velvet/SPAdes output.
        contig = vcf_data_line.CHROM
        if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
            # Remove this SNP
            vcf_writer_removed.write_record(vcf_data_line)
        else:
            # Preserve this SNP
            vcf_writer_preserved.write_record(vcf_data_line)

    vcf_writer_preserved.close()
    vcf_writer_removed.close()
    vcf_reader_handle.close()
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path):
    """Detect abnormal regions in each sample and filter those regions from
    that sample only.

    Parameters
    ----------
    list_of_vcf_files : list of str
        List of input VCF file paths -- one per sample.
    contig_length_dict : dict, str --> int
        Mapping of contig id to int length of contig.
    sorted_list_of_outgroup_samples : list of str
        List of sample IDs for samples that are outgroup samples.
    force_flag : bool
        Force processing even when result files already exist and are newer
        than inputs.
    edge_length : int
        The length of the edge regions in a contig, in which all SNPs will be
        removed.
    window_size_list : list of int
        The length of the window in which the number of SNPs should be no more
        than max_num_snp.
    max_num_snps_list : list of int
        The maximum number of SNPs allowed in a window.  This list has the same
        size as window_size_list and the entries correspond to one another.
    ref_fasta_path : str
        Path to the reference fasta file.
    out_group_list_path : str
        Path to the file indicating outgroup samples, one sample ID per line.
    """
    #==========================================================================
    # Prep work
    #==========================================================================
    input_file_list = list()
    input_file_list.append(ref_fasta_path)
    if out_group_list_path:
        input_file_list.append(out_group_list_path)

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild only for that sample.
    # A missing output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        input_files = input_file_list + [vcf_file_path]
        preserved_needs_rebuild = utils.target_needs_rebuild(input_files, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_files, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = force_flag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions in one sample at a time
    #==========================================================================
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue

        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
            continue

        sample_ID = utils.sample_id_from_file(vcf_file_path)
        utils.verbose_print("Processing sample %s" % sample_ID)

        if sample_ID in sorted_list_of_outgroup_samples:
            write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader)
        else:
            # The bad_regions_dict holds the bad regions for this sample.
            # Key is the contig ID, and the value is a list of bad region
            # tuples (start_position, end_position).
            bad_regions_dict = dict()
            collect_dense_regions(vcf_reader, bad_regions_dict, contig_length_dict, edge_length, max_num_snps_list, window_size_list)

            # Combine all bad regions for each contig
            for contig, regions in bad_regions_dict.items():
                combined_regions = utils.merge_regions(regions)
                bad_regions_dict[contig] = combined_regions

            # Write the output files
            write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict)

        vcf_reader_handle.close()
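# The region bookkeeping above, and write_preserved_and_removed_vcf_files
# before it, lean on two small interval utilities.  Plausible implementations
# -- assumptions for illustration; the real ones are utils.merge_regions and
# utils.in_region -- are sketched here:
def _example_merge_regions(regions):
    # Merge overlapping or touching (start, end) tuples into a minimal list.
    merged = []
    for start, end in sorted(regions):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

def _example_in_region(pos, regions):
    # True when pos falls inside any (start, end) region; ends are treated
    # as inclusive here, which is an assumption about the real helper.
    return any(start <= pos <= end for start, end in regions)

# _example_merge_regions([(0, 500), (450, 900), (2000, 2100)]) returns
# [(0, 900), (2000, 2100)]; _example_in_region(460, [(0, 900)]) is then True.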
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list
            of SNP positions

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_filename = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_filename, "r") as sample_directories_list_file:
        list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    list_of_sample_directories = sorted([d for d in list_of_sample_directories if d])

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = utils.convert_vcf_files_to_snp_dict(list_of_vcf_files)
        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions, then removes SNPs in these regions from
    the original vcf files of all samples.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf files).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files
           contain the removed SNPs and preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function.  The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    sampleDirsFile: File path (not just file name) of file containing paths
        to directories containing var.flt.vcf file for each sequence.
    vcfFileName: File name of the VCF files which must exist in each of the
        sample directories
    refFastaFile: File path (not just file name) of reference fasta file
    edgeLength: the length of the edge of a contig in which SNPs will be
        removed. Default is 500.
    windowSize: the size of the window in which max number of SNPs are
        allowed. Default is 1000.
    maxSNP: the maximum number of SNPs allowed in a window of a size defined
        in windowSize. Default is 3.
    mode: all = Dense regions found in any sample are filtered from all
        samples.
        each = Dense regions found in a sample are filtered from that sample
        only.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.mode == "all"

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except Exception:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(ref_fasta_path, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
    except Exception:
        utils.global_error("Error: cannot open the reference fasta file, or fail to read the contigs in the reference fasta file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Filter regions
    #==========================================================================
    if filter_across_samples:
        filter_regions_across_samples(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
    else:
        filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
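# With windowSizeList / maxSnpsList the density test can run at several
# scales; the two lists pair up entry by entry.  A hedged sketch of how the
# pairs might be scanned -- the pipeline's collect_dense_regions presumably
# does this per pair -- reusing the toy helpers defined earlier in this file:
def _example_multi_scale_scan(sorted_positions, max_num_snps_list, window_size_list):
    all_regions = []
    for max_num_snp, window_size in zip(max_num_snps_list, window_size_list):
        all_regions.extend(_example_find_dense_regions(sorted_positions,
                                                       max_num_snp, window_size))
    return _example_merge_regions(all_regions)

# _example_multi_scale_scan([100, 200, 300, 400, 5000], [3, 2], [1000, 125])
# flags the dense cluster with both the coarse and the fine window, then
# merges the overlapping hits into a single region list.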
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict,
                              sorted_list_of_outgroup_samples, force_flag,
                              edge_length, window_size_list, max_num_snps_list,
                              ref_fasta_path, out_group_list_path):
    """Detect abnormal regions in each sample and filter the SNPs in those
    regions from that sample only.

    Parameters
    ----------
    list_of_vcf_files : list of str
        List of input VCF file paths -- one per sample.
    contig_length_dict : dict, str --> int
        Mapping of contig id to int length of contig.
    sorted_list_of_outgroup_samples : list of str
        List of sample IDs for samples that are outgroup samples.
    force_flag : bool
        Force processing even when result files already exist and are newer than inputs.
    edge_length : int
        The length of the edge regions in a contig, in which all SNPs will be removed.
    window_size_list : list of int
        The length of the window in which the number of SNPs should be no more
        than the corresponding entry in max_num_snps_list.
    max_num_snps_list : list of int
        The maximum number of SNPs allowed in a window.  This list has the same
        size as window_size_list and the entries correspond to one another.
    ref_fasta_path : str
        Path to the reference fasta file.
    out_group_list_path : str
        Path to the file indicating outgroup samples, one sample ID per line.
    """
    #==========================================================================
    # Prep work
    #==========================================================================
    input_file_list = list()
    input_file_list.append(ref_fasta_path)
    if out_group_list_path:
        input_file_list.append(out_group_list_path)

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger a rebuild only for that sample.
    # A missing output file will only cause a rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        input_files = input_file_list + [vcf_file_path]
        preserved_needs_rebuild = utils.target_needs_rebuild(input_files, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_files, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = force_flag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions in one sample at a time
    #==========================================================================
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except:
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path,
                               continue_possible=True)
            continue

        sample_ID = utils.sample_id_from_file(vcf_file_path)
        utils.verbose_print("Processing sample %s" % sample_ID)

        if sample_ID in sorted_list_of_outgroup_samples:
            write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader)
        else:
            # The bad_regions_dict holds the bad regions for this sample.
            # Key is the contig ID, and the value is a list of bad region tuples
            # (start_position, end_position).
            bad_regions_dict = dict()
            collect_dense_regions(vcf_reader, bad_regions_dict, contig_length_dict,
                                  edge_length, max_num_snps_list, window_size_list)

            # Combine all bad regions for each contig
            for contig, regions in bad_regions_dict.items():
                combined_regions = utils.merge_regions(regions)
                bad_regions_dict[contig] = combined_regions

            # Write the output files
            write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict)

        vcf_reader_handle.close()
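
# The incremental-rebuild logic above assumes utils.target_needs_rebuild does a
# make-style timestamp comparison.  The function below is a minimal,
# hypothetical sketch of that behavior; the pipeline's implementation may also
# handle missing input files and other edge cases.
def _example_target_needs_rebuild(input_paths, target_path):
    """Return True if the target file is missing or older than any input file.

    Assumes every path in input_paths exists.  Uses the module-level os import.
    """
    if not os.path.isfile(target_path):
        return True
    target_mtime = os.path.getmtime(target_path)
    return any(os.path.getmtime(path) > target_mtime for path in input_paths)
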
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions; then removes SNPs in those regions from the
    original vcf files of all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf files).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain
           the removed SNPs and preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of this
    function. The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        edgeLength: The length of the edge of a contig in which SNPs will be
            removed. Default is 500.
        windowSize: The size of the window in which at most maxSNP SNPs are
            allowed. Default is 1000.
        maxSNP: The maximum number of SNPs allowed in a window of a size defined
            in windowSize. Default is 3.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate some parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    input_file_list = list()

    out_group_list_path = args.outGroupFile
    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")
    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count,
                           continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(args.refFastaFile, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # Build contig_length_dict, mapping contig id to contig length
            contig_length_dict[record.id] = len(record.seq)
        input_file_list.append(args.refFastaFile)
    except:
        utils.global_error("Error: cannot open the reference fasta file, or failed to read the contigs in the reference fasta file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger a rebuild for all samples
    # because the bad regions are combined across all samples.  However, a
    # missing output file will only cause a rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_file_list, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions.
    #==========================================================================
    bad_regions_dict = dict()  # Key is the contig ID, and the value is a list of bad regions.
    for vcf_file_path in list_of_vcf_files:
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except:
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path,
                               continue_possible=True)
            continue

        # Get the sample ID from the sample directory name
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID in sorted_list_of_outgroup_samples:
            if not need_rebuild_dict[vcf_file_path]:
                vcf_reader_handle.close()
                continue
            # Copy the original vcf file to _preserved.vcf, and create an empty _removed.vcf
            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            vcf_writer_removed = None
            try:
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                # Close the writer and remove the partially written removed file
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                if os.path.exists(removed_vcf_file_path):
                    os.remove(removed_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path,
                                   continue_possible=True)
                continue
            vcf_writer_removed.close()
            vcf_reader_handle.close()
            shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
        else:
            # SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]),
            # where SNP_Record is a line in the VCF file.
            snp_dict = defaultdict(list)
            for vcf_data_line in vcf_reader:
                # Store all SNPs in this sample, keyed by contig name.  The CHROM
                # should be a contig name in the format of Velvet/SPAdes output.
                record = (vcf_data_line.POS, vcf_data_line)
                snp_dict[vcf_data_line.CHROM].append(record)

            # Find bad regions and add them into bad_regions_dict
            for contig, snp_list in snp_dict.items():
                # Sort all SNPs in this contig by position
                sorted_list = sorted(snp_list, key=lambda snp: snp[0])

                # Total number of SNPs
                num_of_snp = len(sorted_list)

                if contig not in bad_regions_dict:
                    # New contig
                    try:
                        contig_length = contig_length_dict[contig]
                    except:
                        # Cannot find the contig length; use sys.maxsize.
                        contig_length = sys.maxsize

                    if contig_length <= (edge_length * 2):
                        bad_regions_dict[contig] = [(0, contig_length)]
                    else:
                        region = [(0, edge_length), (contig_length - edge_length, contig_length)]
                        bad_regions_dict[contig] = region

                # Process SNPs
                for idx, snp in enumerate(sorted_list):
                    if (idx + max_num_snp) < num_of_snp:
                        pos_start = snp[0]
                        pos_end = sorted_list[idx + max_num_snp][0]
                        if (pos_start + window_size) >= pos_end:
                            # Add bad region
                            regions = bad_regions_dict[contig]
                            temp_region = (pos_start, pos_end)
                            regions.append(temp_region)
            vcf_reader_handle.close()

    # Combine all bad regions for each contig
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        combined_regions = utils.consensus(sorted_regions)
        bad_regions_dict[contig] = combined_regions

    # Scan vcf files to remove SNPs
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue

        # Get the sample ID from the sample directory name
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID not in sorted_list_of_outgroup_samples:
            try:
                vcf_reader_handle = open(vcf_file_path, 'r')
                vcf_reader = vcf.Reader(vcf_reader_handle)
            except:
                utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path,
                                   continue_possible=True)
                continue

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            vcf_writer_preserved = None
            try:
                vcf_writer_preserved = vcf.Writer(open(preserved_vcf_file_path, 'w'), vcf_reader)
            except:
                # Close the writer and remove the partially written preserved file
                if vcf_writer_preserved is not None:
                    vcf_writer_preserved.close()
                if os.path.exists(preserved_vcf_file_path):
                    os.remove(preserved_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path,
                                   continue_possible=True)
                continue
            vcf_writer_removed = None
            try:
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                # Close the writer and remove the partially written removed file
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                if os.path.exists(removed_vcf_file_path):
                    os.remove(removed_vcf_file_path)
                vcf_writer_preserved.close()
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path,
                                   continue_possible=True)
                continue

            for vcf_data_line in vcf_reader:
                # The CHROM field should be a contig name in the format of Velvet/SPAdes output.
                contig = vcf_data_line.CHROM
                if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                    # Remove this SNP
                    vcf_writer_removed.write_record(vcf_data_line)
                else:
                    # Preserve this SNP
                    vcf_writer_preserved.write_record(vcf_data_line)

            vcf_writer_preserved.close()
            vcf_writer_removed.close()
            vcf_reader_handle.close()
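
# filter_regions combines overlapping bad regions per contig via
# utils.sort_coord and utils.consensus (filter_regions_per_sample uses
# utils.merge_regions for the same purpose).  The function below is a minimal,
# hypothetical sketch of that merge step, assuming closed (start, end)
# intervals; it is illustrative only and not the pipeline's implementation.
def _example_merge_regions(regions):
    """Sort (start, end) tuples by start and merge any that overlap or touch."""
    merged = []
    for start, end in sorted(regions):
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous region; extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged
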