def write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader):
    """Write the _preserved.vcf and _removed.vcf files for an outgroup sample.

    The dense snps are not filtered from outgroup samples.  Instead, we
    copy the original vcf file to _preserved.vcf, and create a _removed.vcf
    file to which no records are written (the writer is opened with the
    reader as its template and closed immediately).

    If the _removed.vcf file cannot be created, any partially written file is
    deleted, the failure is reported with utils.sample_error using
    continue_possible=True, and the _preserved.vcf file is not written.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.  The last four characters are dropped to
        build the output file names, so the path is expected to end in ".vcf".
    vcf_reader : PyVcf vcf.Reader
        Previously opened VCF reader object, used as the template for the
        writer of the _removed.vcf file.
    """
    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    removed_file = None
    try:
        # Keep our own reference to the file object so it can be closed even
        # when the writer constructor fails after the file was opened.
        removed_file = open(removed_vcf_file_path, 'w')
        vcf_writer_removed = vcf.Writer(removed_file, vcf_reader)
    except Exception:
        if removed_file is not None:
            removed_file.close()
        # Best-effort cleanup: the file does not exist when open() itself
        # failed, and a cleanup failure must not hide the error report below.
        try:
            os.remove(removed_vcf_file_path)
        except OSError:
            pass
        utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
        return

    vcf_writer_removed.close()
    shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
Beispiel #2
0
def write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader):
    """Write the _preserved.vcf and _removed.vcf files for an outgroup sample.

    The dense snps are not filtered from outgroup samples.  Instead, we
    copy the original vcf file to _preserved.vcf, and create a _removed.vcf
    file to which no records are written (the writer is opened with the
    reader as its template and closed immediately).

    If the _removed.vcf file cannot be created, any partially written file is
    deleted, the failure is reported with utils.sample_error using
    continue_possible=True, and the _preserved.vcf file is not written.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.  The last four characters are dropped to
        build the output file names, so the path is expected to end in ".vcf".
    vcf_reader : PyVcf vcf.Reader
        Previously opened VCF reader object, used as the template for the
        writer of the _removed.vcf file.
    """
    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    removed_file = None
    try:
        # Keep our own reference to the file object so it can be closed even
        # when the writer constructor fails after the file was opened.
        removed_file = open(removed_vcf_file_path, 'w')
        vcf_writer_removed = vcf.Writer(removed_file, vcf_reader)
    except Exception:
        if removed_file is not None:
            removed_file.close()
        # Best-effort cleanup: the file does not exist when open() itself
        # failed, and a cleanup failure must not hide the error report below.
        try:
            os.remove(removed_vcf_file_path)
        except OSError:
            pass
        utils.sample_error(
            "Error: Cannot create the file for removed SNPs: %s." %
            removed_vcf_file_path,
            continue_possible=True)
        return

    vcf_writer_removed.close()
    shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
Beispiel #3
0
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            sampleDirectories.txt.filtered (*)
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.
        4. The sampleDirectories.txt.filtered output file lists, in their
           original order, the sample directories that were not excluded for
           having more than maxSnps snps.  It is only written when the snp
           list is rebuilt.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Args:
        options_dict: Dictionary of options with the following keys.
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list
            of SNP positions
        maxSnps: Samples whose VCF file yields more than this number of snps
            are left out of the snp list and out of the filtered list of
            sample directories.  A negative value disables the limit.
        forceFlag: When true, rebuild the snp list even if
            utils.target_needs_rebuild reports it does not need rebuilding.

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                    'maxSnps':-1,
                    'forceFlag':False,
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    # Drop blank lines
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(directory, vcf_file_name) for directory in sorted_list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        max_snps = options_dict['maxSnps']
        snp_dict = dict()  # maps each snp key to the list of sample names having it
        excluded_sample_directories = set()
        for sample_dir, vcf_file_path in zip(sorted_list_of_sample_directories, list_of_vcf_files):

            # Missing or empty VCF files were already reported above; skip them here.
            if not os.path.isfile(vcf_file_path):
                continue
            if os.path.getsize(vcf_file_path) == 0:
                continue

            verbose_print("Processing VCF file %s" % vcf_file_path)
            sample_name = os.path.basename(os.path.dirname(vcf_file_path))
            snp_set = utils.convert_vcf_file_to_snp_set(vcf_file_path)
            if max_snps >= 0 and len(snp_set) > max_snps:
                verbose_print("Excluding sample %s having %d snps." % (sample_name, len(snp_set)))
                excluded_sample_directories.add(sample_dir)
                continue

            for key in snp_set:
                snp_dict.setdefault(key, []).append(sample_name)

        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")

        #==========================================================================
        # Write the filtered list of sample directories
        #==========================================================================
        filtered_sample_directories_list_path = sample_directories_list_path + ".filtered"
        with open(filtered_sample_directories_list_path, "w") as filtered_samples_file_object:
            # Loop over the unsorted list to keep the order of samples the same as the original.
            # This will keep the same HPC log file suffix number.
            for sample_dir in unsorted_list_of_sample_directories:
                if sample_dir not in excluded_sample_directories:
                    filtered_samples_file_object.write("%s\n" % sample_dir)
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Beispiel #4
0
def create_snp_matrix(options_dict):
    """Create SNP matrix

    Description:
    Build the SNP matrix file by concatenating the per-sample consensus fasta
    files, in sorted order of the sample directories, into one multi-fasta
    file.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/consensus.fasta
                ...
            snpma.fasta (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file lists the paths to the sample
           directories, one per line.  Blank lines are ignored.
        2. The consensus.fasta input files hold the previously called
           consensus for each sample.  Files that fail the non-empty check
           are reported and left out of the matrix.
        3. The snpma.fasta output file receives the contents of every usable
           consensus.fasta file, one after the other.

    The sampleDirectories.txt and consensus.fasta files are created outside
        of this function.

    Args:
        options_dict : dict
            Options with the keys listed below.
        sampleDirsFile : str
            File path (not just file name) of the file containing the paths
            to the directories holding a consensus fasta file per sample.
        consFileName : str
            File name of the previously called consensus fasta files which
            must exist in each of the sample directories.
        snpmaFile : str
            File path (not just file name) of the output snp matrix.
        forceFlag : boolean
            When true, rebuild the matrix without checking whether it is
            already up to date.

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'consFileName':'consensus.fasta',
                    'snpmaFile':'snpma.fasta',
                    'forceFlag':False,
                   }
    create_snp_matrix(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    # ---- Read the list of sample directories --------------------------------
    dirs_list_path = options_dict['sampleDirsFile']
    if utils.verify_non_empty_input_files("File of sample directories", [dirs_list_path]) > 0:
        utils.global_error(None)

    with open(dirs_list_path, "r") as dirs_list_handle:
        stripped_lines = [text.rstrip() for text in dirs_list_handle]
    sample_dirs = [entry for entry in stripped_lines if entry]
    sample_dirs.sort()

    # ---- Separate usable consensus files from missing or empty ones ----------
    usable_consensus_paths = []
    unusable_count = 0
    for one_sample_dir in sample_dirs:
        candidate_path = os.path.join(one_sample_dir, options_dict['consFileName'])
        if utils.verify_non_empty_input_files("Consensus fasta file", [candidate_path]) == 1:
            unusable_count += 1
            continue
        usable_consensus_paths.append(candidate_path)

    if unusable_count == len(sample_dirs):
        utils.global_error("Error: all %d consensus fasta files were missing or empty." % unusable_count)
    elif unusable_count > 0:
        utils.sample_error("Error: %d consensus fasta files were missing or empty." % unusable_count, continue_possible=True)

    # ---- Nothing to do when the matrix is already up to date -----------------
    matrix_path = options_dict['snpmaFile']
    if not options_dict['forceFlag'] and not utils.target_needs_rebuild(usable_consensus_paths, matrix_path):
        verbose_print("SNP matrix %s has already been freshly built.  Use the -f option to force a rebuild." % matrix_path)
        verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
        return

    # ---- Concatenate the consensus files into the matrix file ----------------
    with open(matrix_path, "w") as matrix_handle:
        for fasta_path in usable_consensus_paths:
            verbose_print("Merging " + fasta_path)
            with open(fasta_path, "r") as fasta_handle:
                matrix_handle.writelines(fasta_handle)

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Beispiel #5
0
def call_consensus(options_dict):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileups at all positions
           used to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt, and reads.all.pileup are created outside of this function.
       The package documentation provides an example
        of creating these files based on the lambda_virus sequence that is used
        as one test for this package.

    The sample name written to the fasta file (and to the VCF header) is the
    name of the directory containing the pileup file.  Positions in the snp
    list that fail a filter, or that are never seen in the pileup, are
    written as '-'.

    Args:
        options_dict : dict
            Options with the keys listed below.
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.
        vcfFileName : str
            File name of the VCF file to write in the same directory as the
            consensus fasta file.  No VCF file is written when this is empty
            or None.
        vcfAllPos : boolean
            When true, no position filter is given to the pileup reader;
            otherwise the reader is given the set of snp list positions.
        vcfRefName : str
            Passed to the VCF writer when writing the header.  Only read when
            a VCF file is written.
        vcfPreserveRefCase : boolean
            Passed to the VCF writer constructor.  Only read when a VCF file
            is written.
        vcfFailedSnpGt : str
            Passed to the VCF writer with each pileup record.  Only read when
            a VCF file is written.

    Raises:

    Examples:
    options_dict = {'forceFlag':False,
                    'snpListFile':'snplist.txt',
                    'allPileupFile':'reads.all.pileup',
                    'consensusFile':'consensus.fasta',
                    'minBaseQual':15,
                    'minConsFreq':0.6,
                    'minConsStrdDpth':4,
                    'minConsStrdBias':0.10,
                    'vcfFileName':None,
                    'vcfAllPos':False,
                    'vcfFailedSnpGt':'.'
                   }
    call_consensus(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    snp_list_file_path = options_dict['snpListFile']
    all_pileup_file_path = options_dict['allPileupFile']
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = options_dict['consensusFile']
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = options_dict['vcfFileName']
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    # Check if the result is already fresh
    source_files = [snp_list_file_path, all_pileup_file_path]
    if not options_dict['forceFlag'] and not utils.target_needs_rebuild(source_files, consensus_file_path):
        verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
        return

    # Load the list of which positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    verbose_print("snp position list length = %d" % snplist_length)

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(options_dict['minConsFreq'],
                                    options_dict['minConsStrdDpth'],
                                    options_dict['minConsStrdBias'])
    snp_positions = set(snp_list)
    parse_positions = None if options_dict['vcfAllPos'] else snp_positions
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  options_dict['minBaseQual'],
                                  parse_positions)
    writer = None
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, options_dict['vcfPreserveRefCase'])
    try:
        if writer is not None:
            filters = caller.get_filter_descriptions()
            writer.write_header(sample_name, filters, options_dict['vcfRefName'])
        for pileup_record in pileup_reader:
            chrom = pileup_record.chrom
            pos = pileup_record.position
            consensus_base, fail_reasons = caller.call_consensus(pileup_record)
            if (chrom, pos) in snp_positions:
                # A position failing any filter is recorded as a gap
                position_consensus_base_dict[(chrom, pos)] = '-' if fail_reasons else consensus_base

            if writer is not None:
                writer.write_from_pileup(pileup_record, fail_reasons, options_dict['vcfFailedSnpGt'])
    finally:
        # Close the VCF writer even when calling the consensus fails part way through
        if writer is not None:
            writer.close()

    verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    # Positions never seen in the pileup are also recorded as gaps
    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
def call_consensus(args):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileups at all positions
           used to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt, and reads.all.pileup are created outside of this function.
       The package documentation provides an example
        of creating these files based on the lambda_virus sequence that is used
        as one test for this package.

    The sample name written to the fasta file (and to the VCF header) is the
    name of the directory containing the pileup file.  Positions in the snp
    list that fail a filter, that are listed in the exclude file, or that are
    never seen in the pileup, are written as '-'.

    Parameters
    ----------
    args : namespace
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        excludeFile : str
            File path of VCF file of positions to exclude from the snp matrix.
            No positions are excluded when this is empty or None.
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.
        vcfFileName : str
            File name of the VCF file to write in the same directory as the
            consensus fasta file.  No VCF file is written when this is empty
            or None.
        vcfAllPos : boolean
            When true, no position filter is given to the pileup reader;
            otherwise the reader is given the union of the snp list positions
            and the excluded positions.
        vcfRefName : str
            Passed to the VCF writer when writing the header.  Only read when
            a VCF file is written.
        vcfPreserveRefCase : boolean
            Passed to the VCF writer constructor.  Only read when a VCF file
            is written.
        vcfFailedSnpGt : str
            Passed to the VCF writer with each pileup record.  Only read when
            a VCF file is written.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.forceFlag = False
    args.snpListFile = 'snplist.txt'
    args.excludeFile = None
    args.allPileupFile = 'reads.all.pileup'
    args.consensusFile = 'consensus.fasta'
    args.minBaseQual = 15
    args.minConsFreq = 0.6
    args.minConsStrdDpth = 4
    args.minConsStrdBias = 0.10
    args.vcfFileName = None
    args.vcfAllPos = False
    args.vcfFailedSnpGt = '.'
    call_consensus(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    snp_list_file_path = args.snpListFile
    all_pileup_file_path = args.allPileupFile
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = args.consensusFile
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = args.vcfFileName
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    source_files = [snp_list_file_path, all_pileup_file_path]

    exclude_file_path = args.excludeFile
    if exclude_file_path:
        bad_file_count = utils.verify_existing_input_files("Exclude file", [exclude_file_path])
        if bad_file_count > 0:
            utils.sample_error("Error: cannot call consensus without the file of excluded positions.", continue_possible=False)
        excluded_positions = utils.convert_vcf_file_to_snp_set(exclude_file_path)
        # The exclude file is an input too, so it takes part in the freshness check
        source_files.append(exclude_file_path)
    else:
        excluded_positions = set()

    # Check if the result is already fresh
    if not args.forceFlag and not utils.target_needs_rebuild(source_files, consensus_file_path):
        utils.verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        return

    # Load the list of which positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    utils.verbose_print("snp position list length = %d" % snplist_length)
    utils.verbose_print("excluded snps list length = %d" % len(excluded_positions))
    utils.verbose_print("total snp position list length = %d" % (snplist_length + len(excluded_positions)))

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(args.minConsFreq,
                                    args.minConsStrdDpth,
                                    args.minConsStrdBias)

    snp_positions = set(snp_list)
    if args.vcfAllPos:
        parse_positions = None
    else:
        parse_positions = snp_positions.union(excluded_positions)
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  args.minBaseQual,
                                  parse_positions)
    writer = None
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, args.vcfPreserveRefCase)
    try:
        if writer is not None:
            filters = caller.get_filter_descriptions()
            # TODO: it would be better if the exclude file contained filter headers we could read and re-use here instead of hard-coding this
            filters.append(("Region", "Position is in dense region of snps or near the end of the contig."))
            writer.write_header(sample_name, filters, args.vcfRefName)
        for pileup_record in pileup_reader:
            chrom = pileup_record.chrom
            pos = pileup_record.position
            consensus_base, fail_reasons = caller.call_consensus(pileup_record)
            if (chrom, pos) in excluded_positions:
                # TODO: it would be better if the exclude file contained filter reasons we could re-use here instead of hard coding this
                fail_reasons = fail_reasons or []
                fail_reasons.append("Region")
            if (chrom, pos) in snp_positions:
                # A position failing any filter is recorded as a gap
                position_consensus_base_dict[(chrom, pos)] = '-' if fail_reasons else consensus_base

            if writer is not None:
                writer.write_from_pileup(pileup_record, fail_reasons, args.vcfFailedSnpGt)
    finally:
        # Close the VCF writer even when calling the consensus fails part way through
        if writer is not None:
            writer.close()

    utils.verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    # Positions never seen in the pileup are also recorded as gaps
    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions; then remove SNPs in these regions in
    original vcf files of all samples.

    Bad regions are pooled over all non-outgroup samples, so a dense region
    found in any one sample is removed from every non-outgroup sample.
    Outgroup samples are not filtered: their original vcf file is copied to
    the _preserved.vcf file and their _removed.vcf file holds no SNP records.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        outGroupFile: File path of the file listing the outgroup sample IDs,
            one per line, or None when there are no outgroup samples.
        forceFlag: When true, rebuild the output files even if they are
            already newer than the inputs.
        edgeLength: the length of edge of a contig in which SNPs will be removed.
            Default is 500.
        windowSize: the size of the window in which max number of SNPs are allowed.
            Default is 1000.
        maxSNP: the maximum number of SNPs allowed in a window of a size defined in
            windowSize. Default is 3.

    Raises:

    Examples:
    args = argparse.Namespace
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    args.outGroupFile = None
    args.forceFlag = False
    args.edgeLength = 500
    args.windowSize = 1000
    args.maxSNP = 3
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate some parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        sample_directories = [line.rstrip() for line in sample_directories_list_file]
    # Blank lines in the list file are ignored
    sorted_list_of_sample_directories = sorted(d for d in sample_directories if d)

    input_file_list = list()
    out_group_list_path = args.outGroupFile
    # Only membership tests are performed on the outgroup samples, so a set suffices
    outgroup_samples = set()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                outgroup_samples = set(line.rstrip() for line in out_group_list_file)
        except Exception:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [os.path.join(sample_dir, vcf_file_name) for sample_dir in sorted_list_of_sample_directories]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    contig_length_dict = dict()
    try:
        # The with-statement closes the reference file even when parsing fails
        with open(args.refFastaFile, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                contig_length_dict[record.id] = len(record.seq)
    except Exception:
        utils.global_error("Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file.")
    else:
        input_file_list.append(args.refFastaFile)

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild for all samples because
    # the bad regions are combined across all samples.  However, a missing
    # output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_file_list, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions.
    #
    # Every non-outgroup sample is scanned, even those whose outputs are fresh,
    # because the bad regions are combined across all samples.
    #==========================================================================
    bad_regions_dict = dict() # Key is the contig ID, and the value is a list of bad regions.
    for vcf_file_path in list_of_vcf_files:
        vcf_reader_handle, vcf_reader = _filter_regions_open_vcf_reader(vcf_file_path)
        if vcf_reader is None:
            continue
        try:
            sample_ID = _filter_regions_sample_id(vcf_file_path)
            if sample_ID in outgroup_samples:
                if need_rebuild_dict[vcf_file_path]:
                    _filter_regions_write_outgroup_files(vcf_file_path, vcf_reader)
            else:
                _filter_regions_collect_bad_regions(vcf_reader, bad_regions_dict, contig_length_dict,
                                                    edge_length, window_size, max_num_snp)
        finally:
            vcf_reader_handle.close()

    # Combine all bad regions for each contig
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        combined_regions = utils.consensus(sorted_regions)
        bad_regions_dict[contig] = combined_regions

    # Scan vcf files to remove SNPs
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue
        # Outgroup samples were already handled while finding the bad regions
        if _filter_regions_sample_id(vcf_file_path) in outgroup_samples:
            continue
        _filter_regions_split_vcf(vcf_file_path, bad_regions_dict)


def _filter_regions_sample_id(vcf_file_path):
    """Return the name of the directory holding the VCF file, which serves as the sample ID."""
    # os.path handles the separator used by os.path.join on the current platform
    return os.path.basename(os.path.dirname(vcf_file_path))


def _filter_regions_open_vcf_reader(vcf_file_path):
    """Open a VCF file for reading; return (handle, reader), or (None, None) after reporting a sample error."""
    vcf_reader_handle = None
    try:
        vcf_reader_handle = open(vcf_file_path, 'r')
        return vcf_reader_handle, vcf.Reader(vcf_reader_handle)
    except Exception:
        # Do not leak the handle when the file opens but cannot be parsed
        if vcf_reader_handle is not None:
            vcf_reader_handle.close()
        utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
        return None, None


def _filter_regions_create_vcf_writer(output_vcf_file_path, vcf_reader):
    """Create a VCF writer using vcf_reader as template; return None after removing any partial file on failure."""
    output_handle = None
    try:
        output_handle = open(output_vcf_file_path, 'w')
        return vcf.Writer(output_handle, vcf_reader)
    except Exception:
        if output_handle is not None:
            output_handle.close()
        try:
            os.remove(output_vcf_file_path)
        except OSError:
            # The file was never created, so there is nothing to clean up
            pass
        return None


def _filter_regions_write_outgroup_files(vcf_file_path, vcf_reader):
    """Copy an outgroup sample's VCF file to _preserved.vcf and create a _removed.vcf without SNP records."""
    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    vcf_writer_removed = _filter_regions_create_vcf_writer(removed_vcf_file_path, vcf_reader)
    if vcf_writer_removed is None:
        utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
        return

    vcf_writer_removed.close()
    shutil.copyfile(vcf_file_path, preserved_vcf_file_path)


def _filter_regions_collect_bad_regions(vcf_reader, bad_regions_dict, contig_length_dict, edge_length, window_size, max_num_snp):
    """Append one sample's contig edge regions and dense SNP windows to bad_regions_dict, in place."""
    # Only the positions are needed to locate dense windows
    positions_by_contig = defaultdict(list)
    for vcf_data_line in vcf_reader:
        positions_by_contig[vcf_data_line.CHROM].append(vcf_data_line.POS)

    for contig, positions in positions_by_contig.items():
        positions.sort()
        num_of_snp = len(positions)

        if contig not in bad_regions_dict:
            # New contig: seed its bad regions with the contig edges.
            # A contig missing from the reference gets sys.maxsize as length.
            contig_length = contig_length_dict.get(contig, sys.maxsize)
            if contig_length <= (edge_length * 2):
                # The two edges cover the whole contig
                bad_regions_dict[contig] = [(0, contig_length)]
            else:
                bad_regions_dict[contig] = [(0, edge_length), (contig_length - edge_length, contig_length)]

        regions = bad_regions_dict[contig]
        for idx, pos_start in enumerate(positions):
            if (idx + max_num_snp) < num_of_snp:
                # This SNP and the one max_num_snp places later bracket
                # max_num_snp + 1 SNPs; the span is bad when it fits in a window.
                pos_end = positions[idx + max_num_snp]
                if (pos_start + window_size) >= pos_end:
                    regions.append((pos_start, pos_end))


def _filter_regions_split_vcf(vcf_file_path, bad_regions_dict):
    """Write each SNP of a sample to its _removed.vcf or _preserved.vcf file depending on the bad regions."""
    vcf_reader_handle, vcf_reader = _filter_regions_open_vcf_reader(vcf_file_path)
    if vcf_reader is None:
        return

    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    vcf_writer_preserved = None
    vcf_writer_removed = None
    try:
        vcf_writer_preserved = _filter_regions_create_vcf_writer(preserved_vcf_file_path, vcf_reader)
        if vcf_writer_preserved is None:
            utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path, continue_possible=True)
            return

        vcf_writer_removed = _filter_regions_create_vcf_writer(removed_vcf_file_path, vcf_reader)
        if vcf_writer_removed is None:
            utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
            return

        for vcf_data_line in vcf_reader:
            contig = vcf_data_line.CHROM
            if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                # Remove this SNP
                vcf_writer_removed.write_record(vcf_data_line)
            else:
                # Preserve this SNP
                vcf_writer_preserved.write_record(vcf_data_line)
    finally:
        # Release every file that was opened, whichever path led here
        for vcf_writer in (vcf_writer_preserved, vcf_writer_removed):
            if vcf_writer is not None:
                vcf_writer.close()
        vcf_reader_handle.close()
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions; then remove SNPs in these regions in
    original vcf files of all samples.

    This function validates the input files, reads the contig lengths from
    the reference, and then delegates the filtering to either
    filter_regions_across_samples() or filter_regions_per_sample(),
    depending on the acrossSamples argument.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        outGroupFile: File path of the file listing the outgroup sample IDs,
            one per line, or None when there are no outgroup samples.
        forceFlag: Force processing even when result files already exist and
            are newer than inputs.
        edgeLength: the length of edge of a contig in which SNPs will be removed.
            Default is 500.
        windowSizeList: the sizes of the windows in which max number of SNPs are allowed.
        maxSnpsList: the maximum number of SNPs allowed in each window.  The entries
            correspond one-to-one with the entries of windowSizeList.
        acrossSamples: Dense regions found in any sample are filtered from all samples.

    Raises:

    Examples:
    args = argparse.Namespace
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    args.outGroupFile = None
    args.forceFlag = False
    args.edgeLength = 500
    args.windowSizeList = [1000]
    args.maxSnpsList = [3]
    args.acrossSamples = False
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.acrossSamples

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        sample_directories = [line.rstrip() for line in sample_directories_list_file]
    # Blank lines in the list file are ignored
    sorted_list_of_sample_directories = sorted(d for d in sample_directories if d)

    list_of_vcf_files = [os.path.join(sample_dir, vcf_file_name) for sample_dir in sorted_list_of_sample_directories]
    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                sorted_list_of_outgroup_samples = sorted(line.rstrip() for line in out_group_list_file)
        except Exception:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    contig_length_dict = dict()
    try:
        # The with-statement closes the reference file even when parsing fails
        with open(ref_fasta_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                contig_length_dict[record.id] = len(record.seq)
    except Exception:
        utils.global_error("Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file.")

    #==========================================================================
    # Filter regions
    #==========================================================================
    # Both strategies take the same arguments; only the pooling of regions differs
    if filter_across_samples:
        filter_function = filter_regions_across_samples
    else:
        filter_function = filter_regions_per_sample
    filter_function(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
def write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict):
    """Given a VCF file and a collection of abnormal regions, scan the snps in
    the VCF file and write each snp to either the preserved or removed output VCF file.

    The output files are written next to the input file, with the last four
    characters of the input path (the ".vcf" extension) replaced by
    "_preserved.vcf" and "_removed.vcf".  Failure to open the input or to
    create an output is reported with utils.sample_error() and the function
    returns without processing any records.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.
    bad_regions_dict : dict
        Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position).
        Every contig appearing in the VCF file must have an entry.
    """
    vcf_reader_handle = None
    try:
        vcf_reader_handle = open(vcf_file_path, 'r')
        vcf_reader = vcf.Reader(vcf_reader_handle)
    except Exception:
        # Do not leak the handle when the file opens but cannot be parsed
        if vcf_reader_handle is not None:
            vcf_reader_handle.close()
        utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
        return

    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    def create_writer(output_vcf_file_path):
        """Return a VCF writer for the path, or None after removing any partial file."""
        output_handle = None
        try:
            output_handle = open(output_vcf_file_path, 'w')
            return vcf.Writer(output_handle, vcf_reader)
        except Exception:
            if output_handle is not None:
                output_handle.close()
            try:
                os.remove(output_vcf_file_path)
            except OSError:
                # The file was never created, so there is nothing to clean up
                pass
            return None

    vcf_writer_preserved = None
    vcf_writer_removed = None
    try:
        vcf_writer_preserved = create_writer(preserved_vcf_file_path)
        if vcf_writer_preserved is None:
            utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path, continue_possible=True)
            return

        vcf_writer_removed = create_writer(removed_vcf_file_path)
        if vcf_writer_removed is None:
            utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
            return

        for vcf_data_line in vcf_reader:
            contig = vcf_data_line.CHROM
            if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                # Remove this SNP
                vcf_writer_removed.write_record(vcf_data_line)
            else:
                # Preserve this SNP
                vcf_writer_preserved.write_record(vcf_data_line)
    finally:
        # Release every file that was opened, whichever path led here
        for vcf_writer in (vcf_writer_preserved, vcf_writer_removed):
            if vcf_writer is not None:
                vcf_writer.close()
        vcf_reader_handle.close()
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path):
    """Detect abnormal regions in each sample and filter those regions from that same sample only.

    Each sample is processed independently: the bad regions collected from a
    sample's VCF file are used to split that sample's snps into its preserved
    and removed VCF files.  Outgroup samples are handed to
    write_outgroup_preserved_and_removed_vcf_files() instead of being filtered.

    Parameters
    ----------
    list_of_vcf_files : list of str
        List of input VCF file paths -- one per sample.
    contig_length_dict : dict, str --> int
        Mapping of contig id to int length of contig.
    sorted_list_of_outgroup_samples : list of str
        List of sample IDs for samples that are outgroup samples.
    force_flag : bool
        Force processing even when result files already exist and are newer than inputs.
    edge_length : int
        The length of the edge regions in a contig, in which all SNPs will be removed.
    window_size_list : list of int
        The length of the window in which the number of SNPs should be no more than max_num_snp.
    max_num_snps_list : list of int
        The maximum number of SNPs allowed in a window.  This list has the same size as window_size_list
        and the entries correspond to one another.
    ref_fasta_path : str
        Path to the reference fasta file.
    out_group_list_path : str
        Path to the file indicating outgroup samples, one sample ID per line.
    """
    #==========================================================================
    # Prep work
    #==========================================================================
    # Inputs shared by every sample; each sample's own VCF file is added below
    shared_input_files = [ref_fasta_path]
    if out_group_list_path:
        shared_input_files.append(out_group_list_path)

    #==========================================================================
    # Which samples need rebuild?
    #
    # A sample is rebuilt when forced, when either of its output files is
    # missing, or when the reference, the outgroup list, or the sample's own
    # VCF file is newer than its outputs.  Other samples' VCF files do not
    # trigger a rebuild because samples are filtered independently.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        input_files = shared_input_files + [vcf_file_path]
        preserved_needs_rebuild = utils.target_needs_rebuild(input_files, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_files, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = force_flag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions in one sample at a time
    #==========================================================================
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue

        vcf_reader_handle = None
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            # Do not leak the handle when the file opens but cannot be parsed
            if vcf_reader_handle is not None:
                vcf_reader_handle.close()
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
            continue

        try:
            sample_ID = utils.sample_id_from_file(vcf_file_path)
            utils.verbose_print("Processing sample %s" % sample_ID)
            if sample_ID in sorted_list_of_outgroup_samples:
                write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader)
            else:
                # The bad_regions_dict holds the bad regions for this sample
                # Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position).
                bad_regions_dict = dict()
                collect_dense_regions(vcf_reader, bad_regions_dict, contig_length_dict, edge_length, max_num_snps_list, window_size_list)

                # Combine all bad regions for each contig
                for contig in list(bad_regions_dict):
                    bad_regions_dict[contig] = utils.merge_regions(bad_regions_dict[contig])

                # Write the output files
                write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict)
        finally:
            # Close the input even when processing this sample raises
            vcf_reader_handle.close()
Beispiel #11
0
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    The SNP list is only rebuilt when forced or when
    utils.target_needs_rebuild() reports that it is out of date with
    respect to the VCF files.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list
            of SNP positions
        forceFlag: Optional.  When true, rebuild the SNP list even if it is
            already up to date.  Treated as False when absent.

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                    'forceFlag':False,
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_filename = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_filename, "r") as sample_directories_list_file:
        sample_directories = [line.rstrip() for line in sample_directories_list_file]
    # Blank lines in the list file are ignored
    list_of_sample_directories = sorted(d for d in sample_directories if d)

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(sample_dir, vcf_file_name) for sample_dir in list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    # A missing forceFlag means "do not force" rather than a KeyError
    force_rebuild = options_dict.get('forceFlag', False)
    if force_rebuild or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = utils.convert_vcf_files_to_snp_dict(list_of_vcf_files)
        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Beispiel #12
0
def filter_regions(args):
    """Remove SNPs located in abnormal regions from the original vcf files.

    This function validates the input files, reads the list of sample
    directories, the optional list of outgroup samples, and the contig
    lengths of the reference.  It then delegates the filtering itself to
    filter_regions_across_samples() or filter_regions_per_sample(),
    depending on args.mode.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain
           the removed SNPs and preserved SNPs.  They are written by the
           filtering function this function delegates to.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    args : argparse.Namespace
        The following attributes are read:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        forceFlag: Passed through to the filtering function as force_flag.
        edgeLength: the length of edge of a contig in which SNPs will be removed.
        windowSizeList: the sizes of the windows in which a max number of SNPs
            are allowed.
        maxSnpsList: the maximum number of SNPs allowed in the windows defined
            in windowSizeList.
        outGroupFile: File path of the file listing the outgroup samples, one
            per line, or None when there are no outgroup samples.
        mode:
            all = Dense regions found in any sample are filtered from all samples.
            any other value (each) = Dense regions are filtered independently
            from each sample.

    Examples
    --------
    args = argparse.Namespace
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.mode == "all"

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files(
        "File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path,
              "r") as sample_directories_list_file:
        sample_directories = [
            line.rstrip() for line in sample_directories_list_file
        ]
    # Ignore blank lines and process the samples in a deterministic order
    sorted_list_of_sample_directories = sorted(
        d for d in sample_directories if d)

    list_of_vcf_files = [
        os.path.join(sample_directory, vcf_file_name)
        for sample_directory in sorted_list_of_sample_directories
    ]
    bad_file_count = utils.verify_non_empty_input_files(
        "VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." %
                           bad_file_count)
    elif bad_file_count > 0:
        # Some, but not all, samples are unusable
        utils.sample_error("Error: %d VCF files were missing or empty." %
                           bad_file_count,
                           continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files(
        "Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files(
            "File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                sorted_list_of_outgroup_samples = sorted(
                    line.rstrip() for line in out_group_list_file)
        except Exception:
            utils.global_error(
                "Error: Cannot open the file containing the list of outgroup samples!"
            )

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    # Bound before the try block so the name exists even when reading fails
    contig_length_dict = dict()
    try:
        # The with statement closes the handle on success and on failure
        with open(ref_fasta_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                # build contig_length_dict
                contig_length_dict[record.id] = len(record.seq)
    except Exception:
        utils.global_error(
            "Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file."
        )

    #==========================================================================
    # Filter regions
    #==========================================================================
    if filter_across_samples:
        filter_regions_across_samples(list_of_vcf_files, contig_length_dict,
                                      sorted_list_of_outgroup_samples,
                                      force_flag, edge_length,
                                      window_size_list, max_num_snps_list,
                                      ref_fasta_path, out_group_list_path)
    else:
        filter_regions_per_sample(list_of_vcf_files, contig_length_dict,
                                  sorted_list_of_outgroup_samples, force_flag,
                                  edge_length, window_size_list,
                                  max_num_snps_list, ref_fasta_path,
                                  out_group_list_path)
Beispiel #13
0
def _open_vcf_writer(output_vcf_path, vcf_reader):
    """Create a vcf.Writer for output_vcf_path, or clean up and return None.

    Parameters
    ----------
    output_vcf_path : str
        Path of the VCF file to create.
    vcf_reader : PyVcf vcf.Reader
        Previously opened reader, passed to vcf.Writer as the template.

    Returns
    -------
    vcf_writer : PyVcf vcf.Writer or None
        The opened writer, or None when the file or the writer could not be
        created.  In that case the partially created file is removed.
    """
    output_handle = None
    try:
        output_handle = open(output_vcf_path, 'w')
        return vcf.Writer(output_handle, vcf_reader)
    except Exception:
        # No writer exists to close the handle for us
        if output_handle is not None:
            output_handle.close()
        try:
            os.remove(output_vcf_path)
        except OSError:
            # Best-effort cleanup: there is nothing to remove when open()
            # itself failed, and raising here would hide the real problem
            # from the caller's error report.
            pass
        return None


def write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict):
    """Given a VCF file and a collection of abnormal regions, scan the snps in
    the VCF file and write each snp to either the preserved or removed output VCF file.

    The output files are named after the input file with the last four
    characters of its path replaced by "_preserved.vcf" and "_removed.vcf".
    Any failure to open the input or to create an output is reported with
    utils.sample_error(..., continue_possible=True) and the function returns
    without writing any snp.

    Parameters
    ----------
    vcf_file_path : str
        Path to a sample VCF file.
    bad_regions_dict : dict
        Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position).
        It is indexed with the CHROM of every record in the VCF file, so a plain
        dict missing one of those contigs raises KeyError.
    """
    vcf_reader_handle = None
    try:
        vcf_reader_handle = open(vcf_file_path, 'r')
        vcf_reader = vcf.Reader(vcf_reader_handle)
    except Exception:
        # Do not leak the handle when the file opens but cannot be parsed
        if vcf_reader_handle is not None:
            vcf_reader_handle.close()
        utils.sample_error("Error: Cannot open the input vcf file: %s." %
                           vcf_file_path,
                           continue_possible=True)
        return

    preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
    removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

    vcf_writer_preserved = _open_vcf_writer(preserved_vcf_file_path,
                                            vcf_reader)
    if vcf_writer_preserved is None:
        vcf_reader_handle.close()
        utils.sample_error(
            "Error: Cannot create the file for preserved SNPs: %s." %
            preserved_vcf_file_path,
            continue_possible=True)
        return

    vcf_writer_removed = _open_vcf_writer(removed_vcf_file_path, vcf_reader)
    if vcf_writer_removed is None:
        # Release everything opened so far before reporting the error
        vcf_writer_preserved.close()
        vcf_reader_handle.close()
        utils.sample_error(
            "Error: Cannot create the file for removed SNPs: %s." %
            removed_vcf_file_path,
            continue_possible=True)
        return

    try:
        for vcf_data_line in vcf_reader:
            # The CHROM of the record selects the bad regions to test against
            contig = vcf_data_line.CHROM
            if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                # Remove this SNP
                vcf_writer_removed.write_record(vcf_data_line)
            else:
                # Preserve this SNP
                vcf_writer_preserved.write_record(vcf_data_line)
    finally:
        # Close all files even when reading or writing a record fails
        vcf_writer_preserved.close()
        vcf_writer_removed.close()
        vcf_reader_handle.close()
Beispiel #14
0
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict,
                              sorted_list_of_outgroup_samples, force_flag,
                              edge_length, window_size_list, max_num_snps_list,
                              ref_fasta_path, out_group_list_path):
    """Detect abnormal regions in each sample and filter those regions from
    that sample only.

    Each sample is handled independently: the bad regions collected from one
    sample's VCF file are used to split only that sample's snps into the
    preserved and removed output VCF files.  Outgroup samples are not
    filtered; they are handed to
    write_outgroup_preserved_and_removed_vcf_files() instead.

    Parameters
    ----------
    list_of_vcf_files : list of str
        List of input VCF file paths -- one per sample.
    contig_length_dict : dict, str --> int
        Mapping of contig id to int length of contig.
    sorted_list_of_outgroup_samples : list of str
        List of sample IDs for samples that are outgroup samples.
    force_flag : bool
        Force processing even when result files already exist and are newer than inputs.
    edge_length : int
        The length of the edge regions in a contig, in which all SNPs will be removed.
    window_size_list : list of int
        The length of the window in which the number of SNPs should be no more than max_num_snp.
    max_num_snps_list : list of int
        The maximum number of SNPs allowed in a window.  This list has the same size as window_size_list
        and the entries correspond to one another.
    ref_fasta_path : str
        Path to the reference fasta file.
    out_group_list_path : str
        Path to the file indicating outgroup samples, one sample ID per line.
    """
    #==========================================================================
    # Prep work
    #==========================================================================
    # Inputs shared by every sample; each sample adds its own VCF file below
    input_file_list = [ref_fasta_path]
    if out_group_list_path:
        input_file_list.append(out_group_list_path)

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild only for that sample.
    # A missing output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        input_files = input_file_list + [vcf_file_path]
        preserved_needs_rebuild = utils.target_needs_rebuild(
            input_files, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(
            input_files, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = (force_flag
                                            or preserved_needs_rebuild
                                            or removed_needs_rebuild)

    if not any(need_rebuild_dict.values()):
        utils.verbose_print(
            "All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild."
        )
        return

    #==========================================================================
    # Find all bad regions in one sample at a time
    #==========================================================================
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue

        vcf_reader_handle = None
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            # Do not leak the handle when the file opens but cannot be parsed
            if vcf_reader_handle is not None:
                vcf_reader_handle.close()
            utils.sample_error("Error: Cannot open the input vcf file: %s." %
                               vcf_file_path,
                               continue_possible=True)
            continue

        try:
            sample_ID = utils.sample_id_from_file(vcf_file_path)
            utils.verbose_print("Processing sample %s" % sample_ID)
            if sample_ID in sorted_list_of_outgroup_samples:
                write_outgroup_preserved_and_removed_vcf_files(
                    vcf_file_path, vcf_reader)
            else:
                # The bad_regions_dict holds the bad regions for this sample
                # Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position).
                bad_regions_dict = dict()
                collect_dense_regions(vcf_reader, bad_regions_dict,
                                      contig_length_dict, edge_length,
                                      max_num_snps_list, window_size_list)

                # Combine all bad regions for each contig.  Only values are
                # replaced, so the dict can be updated while iterating it.
                for contig, regions in bad_regions_dict.items():
                    bad_regions_dict[contig] = utils.merge_regions(regions)

                # Write the output files
                write_preserved_and_removed_vcf_files(vcf_file_path,
                                                      bad_regions_dict)
        finally:
            # Close the reader even when processing this sample fails
            vcf_reader_handle.close()
Beispiel #15
0
def _open_vcf_writer(output_vcf_path, vcf_reader):
    """Create a vcf.Writer for output_vcf_path, or clean up and return None.

    Parameters
    ----------
    output_vcf_path : str
        Path of the VCF file to create.
    vcf_reader : PyVcf vcf.Reader
        Previously opened reader, passed to vcf.Writer as the template.

    Returns
    -------
    vcf_writer : PyVcf vcf.Writer or None
        The opened writer, or None when the file or the writer could not be
        created.  In that case the partially created file is removed.
    """
    output_handle = None
    try:
        output_handle = open(output_vcf_path, 'w')
        return vcf.Writer(output_handle, vcf_reader)
    except Exception:
        # No writer exists to close the handle for us
        if output_handle is not None:
            output_handle.close()
        try:
            os.remove(output_vcf_path)
        except OSError:
            # Best-effort cleanup: there is nothing to remove when open()
            # itself failed, and raising here would hide the real problem
            # from the caller's error report.
            pass
        return None


def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    of the contigs and the windows holding too many SNPs, combines the bad
    regions found in all non-outgroup samples, and then splits the SNPs of
    each non-outgroup sample into a preserved and a removed vcf file.
    Outgroup samples are not filtered: their original vcf file is copied to
    the preserved file and a removed file without SNPs is created.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    args : argparse.Namespace
        The following attributes are read:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        outGroupFile: File path of the file listing the outgroup samples, one
            per line, or None when there are no outgroup samples.
        forceFlag: Rebuild the output files even when they are up to date.
        edgeLength: the length of edge of a contig in which SNPs will be removed.
        windowSize: the size of the window in which max number of SNPs are allowed.
        maxSNP: the maximum number of SNPs allowed in a window of a size defined in
            windowSize.

    Examples
    --------
    args = argparse.Namespace
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get the filtering parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files(
        "File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path,
              "r") as sample_directories_list_file:
        sample_directories = [
            line.rstrip() for line in sample_directories_list_file
        ]
    # Ignore blank lines and process the samples in a deterministic order
    sorted_list_of_sample_directories = sorted(
        d for d in sample_directories if d)

    input_file_list = list()
    out_group_list_path = args.outGroupFile
    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files(
            "File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                sorted_list_of_outgroup_samples = sorted(
                    line.rstrip() for line in out_group_list_file)
        except Exception:
            utils.global_error(
                "Error: Cannot open the file containing the list of outgroup samples!"
            )

    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [
        os.path.join(sample_directory, vcf_file_name)
        for sample_directory in sorted_list_of_sample_directories
    ]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files(
        "VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." %
                           bad_file_count)
    elif bad_file_count > 0:
        # Some, but not all, samples are unusable
        utils.sample_error("Error: %d VCF files were missing or empty." %
                           bad_file_count,
                           continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files(
        "Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    # Bound before the try block so the name exists even when reading fails
    contig_length_dict = dict()
    try:
        # The with statement closes the handle on success and on failure
        with open(args.refFastaFile, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                # build contig_length_dict
                contig_length_dict[record.id] = len(record.seq)
        input_file_list.append(args.refFastaFile)
    except Exception:
        utils.global_error(
            "Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file."
        )

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild for all samples because
    # the bad regions are combined across all samples.  However, a missing
    # output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(
            input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(
            input_file_list, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = (args.forceFlag
                                            or preserved_needs_rebuild
                                            or removed_needs_rebuild)

    if not any(need_rebuild_dict.values()):
        utils.verbose_print(
            "All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild."
        )
        return

    #==========================================================================
    # Find all bad regions.
    #==========================================================================
    # Key is the contig ID, and the value is a list of bad regions.
    bad_regions_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        vcf_reader_handle = None
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            # Do not leak the handle when the file opens but cannot be parsed
            if vcf_reader_handle is not None:
                vcf_reader_handle.close()
            utils.sample_error("Error: Cannot open the input vcf file: %s." %
                               vcf_file_path,
                               continue_possible=True)
            continue

        try:
            # The sample ID is the name of the directory holding the vcf file
            sample_ID = os.path.basename(os.path.dirname(vcf_file_path))

            if sample_ID in sorted_list_of_outgroup_samples:
                if not need_rebuild_dict[vcf_file_path]:
                    continue
                # Copy original vcf file to _preserved.vcf, and create a
                # _removed.vcf holding no SNP records
                preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
                removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

                vcf_writer_removed = _open_vcf_writer(removed_vcf_file_path,
                                                      vcf_reader)
                if vcf_writer_removed is None:
                    vcf_reader_handle.close()
                    utils.sample_error(
                        "Error: Cannot create the file for removed SNPs: %s." %
                        removed_vcf_file_path,
                        continue_possible=True)
                    continue

                vcf_writer_removed.close()
                vcf_reader_handle.close()
                shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
            else:
                # SNP positions of this sample, grouped by contig.  Only the
                # positions are needed to find the dense windows.
                snp_dict = defaultdict(list)
                for vcf_data_line in vcf_reader:
                    snp_dict[vcf_data_line.CHROM].append(vcf_data_line.POS)

                # Find bad regions and add them into bad_regions_dict
                for contig, position_list in snp_dict.items():
                    # sort all SNPs in this contig by position
                    sorted_positions = sorted(position_list)

                    # total number of SNPs
                    num_of_snp = len(sorted_positions)

                    if contig not in bad_regions_dict:
                        # New contig: its edges are always bad regions.
                        # Use sys.maxsize when the contig length is unknown.
                        contig_length = contig_length_dict.get(
                            contig, sys.maxsize)

                        if contig_length <= (edge_length * 2):
                            # The two edges cover the whole contig
                            bad_regions_dict[contig] = [(0, contig_length)]
                        else:
                            bad_regions_dict[contig] = [
                                (0, edge_length),
                                (contig_length - edge_length, contig_length)
                            ]

                    # A window is bad when the SNP found max_num_snp entries
                    # later lies no further than window_size away.
                    regions = bad_regions_dict[contig]
                    for idx, pos_start in enumerate(sorted_positions):
                        if (idx + max_num_snp) < num_of_snp:
                            pos_end = sorted_positions[idx + max_num_snp]
                            if (pos_start + window_size) >= pos_end:
                                # Add bad region
                                regions.append((pos_start, pos_end))
        finally:
            # Close the reader even when processing this sample fails
            vcf_reader_handle.close()

    # Combine all bad regions for each contig.  Only values are replaced, so
    # the dict can be updated while iterating it.
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        bad_regions_dict[contig] = utils.consensus(sorted_regions)

    #==========================================================================
    # Scan vcf files to remove SNPs
    #==========================================================================
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue

        # The sample ID is the name of the directory holding the vcf file
        sample_ID = os.path.basename(os.path.dirname(vcf_file_path))
        if sample_ID in sorted_list_of_outgroup_samples:
            # Outgroup samples were already handled in the first pass
            continue

        vcf_reader_handle = None
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except Exception:
            if vcf_reader_handle is not None:
                vcf_reader_handle.close()
            utils.sample_error(
                "Error: Cannot open the input vcf file: %s." %
                vcf_file_path,
                continue_possible=True)
            continue

        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

        vcf_writer_preserved = _open_vcf_writer(preserved_vcf_file_path,
                                                vcf_reader)
        if vcf_writer_preserved is None:
            vcf_reader_handle.close()
            utils.sample_error(
                "Error: Cannot create the file for preserved SNPs: %s." %
                preserved_vcf_file_path,
                continue_possible=True)
            continue

        vcf_writer_removed = _open_vcf_writer(removed_vcf_file_path,
                                              vcf_reader)
        if vcf_writer_removed is None:
            # Release everything opened so far before reporting the error
            vcf_writer_preserved.close()
            vcf_reader_handle.close()
            utils.sample_error(
                "Error: Cannot create the file for removed SNPs: %s." %
                removed_vcf_file_path,
                continue_possible=True)
            continue

        try:
            for vcf_data_line in vcf_reader:
                # The CHROM of the record selects the bad regions to test
                contig = vcf_data_line.CHROM
                if utils.in_region(vcf_data_line.POS,
                                   bad_regions_dict[contig]):
                    # Remove this SNP
                    vcf_writer_removed.write_record(vcf_data_line)
                else:
                    # Preserve this SNP
                    vcf_writer_preserved.write_record(vcf_data_line)
        finally:
            # Close all files even when reading or writing a record fails
            vcf_writer_preserved.close()
            vcf_writer_removed.close()
            vcf_reader_handle.close()