Example 1
def index_ref(args):
    """Index the reference genome.

    Execute an external program (bowtie2 or smalt) to create an index for the
    reference genome to be used during subsequent alignment.  Execute samtools
    to create the faidx index file to be used during subsequent pileups.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta         # input fasta
                referenceFile.#.bt2*        # bowtie2 output
                referenceFile.rev.#.bt2*    # bowtie2 output
                referenceFile.sma*          # smalt output
                referenceFile.smi*          # smalt output
                referenceFile.fasta.fai*    # samtools faidx output

    The input fasta file is created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    reference_base_path = os.path.splitext(reference_file_path)[0]  # strip the file extension

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    # Create index file for reference
    if snp_pipeline_aligner == "bowtie2":
        target_file = reference_base_path + ".rev.1.bt2"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Bowtie index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("bowtie2",
                                                    "bowtie2 --version")
            bowtie2_build_extra_params = os.environ.get(
                "Bowtie2Build_ExtraParams") or ""
            command_line = "bowtie2-build " + bowtie2_build_extra_params + ' ' + reference_file_path + ' ' + reference_base_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            utils.global_error_on_missing_file(target_file, "bowtie2-build")

    elif snp_pipeline_aligner == "smalt":
        target_file = reference_base_path + ".smi"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Smalt index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("smalt", "smalt version")
            smalt_index_extra_params = os.environ.get(
                "SmaltIndex_ExtraParams") or ""
            command_line = "smalt index " + smalt_index_extra_params + ' ' + reference_base_path + ' ' + reference_file_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)

    # Create the samtools fai index
    verbose_print("")
    target_file = reference_file_path + ".fai"
    needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                               target_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# SAMtools fai index %s is already freshly built.  Use the -f option to force a rebuild."
            % target_file)
    else:
        version_str = utils.extract_version_str("samtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_faidx_extra_params = os.environ.get(
            "SamtoolsFaidx_ExtraParams") or ""
        command_line = "samtools faidx " + samtools_faidx_extra_params + ' ' + reference_file_path
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.global_error_on_missing_file(target_file, "samtools faidx")

    # Create the reference dict file used later by GATK
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"
    if enable_local_realignment:
        verbose_print("")
        target_file = reference_base_path + ".dict"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Sequence dictionary %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            utils.remove_file(
                target_file
            )  # Need to delete existing output, if any, before running
            jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
            if not jar_file_path:
                utils.global_error(
                    "Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable."
                )
            version_str = utils.extract_version_str(
                "Picard", "java -jar " + jar_file_path +
                " CreateSequenceDictionary --version 2>&1")
            picard_jvm_extra_params = os.environ.get(
                "PicardJvm_ExtraParams") or ""
            picard_create_sequence_dictionary_extra_params = os.environ.get(
                "CreateSequenceDictionary_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + jar_file_path + " CreateSequenceDictionary REFERENCE=" + reference_file_path + " OUTPUT=" + target_file + tmp_option + ' ' + picard_create_sequence_dictionary_extra_params
            verbose_print("# Create reference sequence dictionary.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(
                target_file, "picard CreateSequenceDictionary")
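
The freshness checks above rely on utils.target_needs_rebuild to decide whether each index file must be recreated. A minimal sketch of that kind of timestamp comparison is shown below; the helper name and exact semantics are assumptions inferred from how it is called in these examples, not the package's actual implementation.

import os

def target_needs_rebuild(source_files, target_file):
    # Hypothetical sketch: rebuild when the target is missing, empty, or older
    # than any of its source files -- mirroring how the calls above behave.
    if not os.path.isfile(target_file) or os.path.getsize(target_file) == 0:
        return True
    target_mtime = os.path.getmtime(target_file)
    return any(os.path.getmtime(source) > target_mtime for source in source_files)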
Example 2
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including contig edges
    and probable prophage regions, and then removes SNPs in those regions from
    the original vcf files of all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        edgeLength: the length of the edge regions at each end of a contig
            within which SNPs will be removed. Default is 500.
        windowSize: the size of the window within which at most maxSNP SNPs
            are allowed. Default is 1000.
        maxSNP: the maximum number of SNPs allowed in a window of the size
            defined by windowSize. Default is 3.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate some parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files(
        "File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path,
              "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [
            line.rstrip() for line in sample_directories_list_file
        ]
    unsorted_list_of_sample_directories = [
        d for d in unsorted_list_of_sample_directories if d
    ]
    sorted_list_of_sample_directories = sorted(
        unsorted_list_of_sample_directories)

    input_file_list = list()
    out_group_list_path = args.outGroupFile
    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files(
            "File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            #There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [
                    line.rstrip() for line in out_group_list_file
                ]
            sorted_list_of_outgroup_samples = sorted(
                unsorted_list_of_outgroup_samples)
        except:
            utils.global_error(
                "Error: Cannot open the file containing the list of outgroup samples!"
            )

    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [
        os.path.join(dir, vcf_file_name)
        for dir in sorted_list_of_sample_directories
    ]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files(
        "VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." %
                           bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." %
                           bad_file_count,
                           continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files(
        "Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(args.refFastaFile, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            #build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
        input_file_list.append(args.refFastaFile)
    except:
        utils.global_error(
            "Error: cannot open the reference fasta file, or failed to read the contigs in the reference fasta file."
        )
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild for all samples because
    # the bad regions are combined across all samples.  However, a missing
    # output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(
            input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(
            input_file_list, removed_vcf_file_path)
        need_rebuild_dict[
            vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print(
            "All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild."
        )
        return

    #==========================================================================
    # Find all bad regions.
    #==========================================================================
    bad_regions_dict = dict(
    )  # Key is the contig ID, and the value is a list of bad regions.
    for vcf_file_path in list_of_vcf_files:
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except:
            utils.sample_error("Error: Cannot open the input vcf file: %s." %
                               vcf_file_path,
                               continue_possible=True)
            continue

        #Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID in sorted_list_of_outgroup_samples:
            if not need_rebuild_dict[vcf_file_path]:
                vcf_reader_handle.close()
                continue
            #Copy the original vcf file to _preserved.vcf, and create an empty _removed.vcf

            #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF.

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(
                    open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                #Close vcf_writer_removed and remove the file removed_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                os.remove(removed_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error(
                    "Error: Cannot create the file for removed SNPs: %s." %
                    removed_vcf_file_path,
                    continue_possible=True)
                continue

            vcf_writer_removed.close()
            vcf_reader_handle.close()
            shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
        else:
            #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF.
            snp_dict = defaultdict(list)
            for vcf_data_line in vcf_reader:
                #Create a dict to store all SNPs in this sample
                #get contig length from contig name. The CHROM should be a contig name in the format of Velvet/SPAdes output.
                record = (vcf_data_line.POS, vcf_data_line)
                snp_dict[vcf_data_line.CHROM].append(record)

            #Find bad regions and add them into bad_region
            for contig, snp_list in snp_dict.items():

                #sort all SNPs in this contig by position
                sorted_list = sorted(snp_list, key=lambda SNPs: SNPs[0])

                #total number of SNPs
                num_of_snp = len(sorted_list)

                if contig not in bad_regions_dict:
                    #New contig
                    try:
                        contig_length = contig_length_dict[contig]
                    except:
                        #Cannot find the contig length; use sys.maxsize instead.
                        contig_length = sys.maxsize

                    if (contig_length <= (edge_length * 2)):
                        bad_regions_dict[contig] = [(0, contig_length)]
                    else:
                        region = [(0, edge_length),
                                  (contig_length - edge_length, contig_length)]
                        bad_regions_dict[contig] = region

                #Process SNPs
                for idx, snp in enumerate(sorted_list):
                    if (idx + max_num_snp) < num_of_snp:
                        pos_start = snp[0]
                        pos_end = sorted_list[idx + max_num_snp][0]
                        if (pos_start + window_size) >= pos_end:
                            #Add bad region
                            regions = bad_regions_dict[contig]
                            temp_region = (pos_start, pos_end)
                            regions.append(temp_region)
        vcf_reader_handle.close()

    #Combine all bad regions for each contig
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        combined_regions = utils.consensus(sorted_regions)
        bad_regions_dict[contig] = combined_regions

    #Scan vcf files to remove SNPs
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue
        #Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID not in sorted_list_of_outgroup_samples:
            try:
                vcf_reader_handle = open(vcf_file_path, 'r')
                vcf_reader = vcf.Reader(vcf_reader_handle)
            except:
                utils.sample_error(
                    "Error: Cannot open the input vcf file: %s." %
                    vcf_file_path,
                    continue_possible=True)
                continue

            #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF.

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            try:
                vcf_writer_preserved = None
                vcf_writer_preserved = vcf.Writer(
                    open(preserved_vcf_file_path, 'w'), vcf_reader)
            except:
                if vcf_writer_preserved is not None:
                    vcf_writer_preserved.close()
                os.remove(preserved_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error(
                    "Error: Cannot create the file for preserved SNPs: %s." %
                    preserved_vcf_file_path,
                    continue_possible=True)
                continue

            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(
                    open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                #Close vcf_writer_removed and remove the file removed_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                os.remove(removed_vcf_file_path)
                vcf_writer_preserved.close()
                vcf_reader_handle.close()
                utils.sample_error(
                    "Error: Cannot create the file for removed SNPs: %s." %
                    removed_vcf_file_path,
                    continue_possible=True)
                continue

            for vcf_data_line in vcf_reader:
                #Create a dict to store all SNPs in this sample
                #get contig length from contig name. The CHROM should be a contig name in the format of Velvet/SPAdes output.
                contig = vcf_data_line.CHROM
                if utils.in_region(vcf_data_line.POS,
                                   bad_regions_dict[contig]):
                    #Remove this SNP
                    vcf_writer_removed.write_record(vcf_data_line)
                else:
                    #Preserve this SNP
                    vcf_writer_preserved.write_record(vcf_data_line)

            vcf_writer_preserved.close()
            vcf_writer_removed.close()
            vcf_reader_handle.close()
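
The last two passes above first merge the per-sample bad regions with utils.sort_coord and utils.consensus, then re-scan each vcf file and split its records into the preserved and removed outputs. A hedged sketch of that interval merge, using a hypothetical stand-in name for the combined helpers, could look like this:

def merge_regions(regions):
    # Hypothetical sketch: merge overlapping or touching (start, end) intervals.
    merged = []
    for start, end in sorted(regions):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# Two overlapping dense-SNP windows collapse into a single bad region.
print(merge_regions([(0, 500), (450, 900), (1200, 1300)]))  # [(0, 900), (1200, 1300)]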
Example 3
def call_consensus(args):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileup at all positions, used
           to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt and reads.all.pileup files are created outside of this
    function. The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Parameters
    ----------
    args : namespace
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        excludeFile : str
            File path of VCF file of positions to exclude from the snp matrix.
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.snpListFile = 'snplist.txt'
    args.allPileupFile = 'reads.all.pileup'
    args.consensusFile = 'consensus.fasta'
    args.minBaseQual = 15
    args.minConsFreq = 0.6
    args.minConsStrdDpth = 4
    args.minConsStrdBias = 0.10
    args.vcfFailedSnpGt = '.'
    call_consensus(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    snp_list_file_path = args.snpListFile
    all_pileup_file_path = args.allPileupFile
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = args.consensusFile
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = args.vcfFileName
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    source_files = [snp_list_file_path, all_pileup_file_path]

    exclude_file_path = args.excludeFile
    if exclude_file_path:
        bad_file_count = utils.verify_existing_input_files("Exclude file", [exclude_file_path])
        if bad_file_count > 0:
            utils.sample_error("Error: cannot call consensus without the file of excluded positions.", continue_possible=False)
        excluded_positions = utils.convert_vcf_file_to_snp_set(exclude_file_path)
        source_files.append(exclude_file_path)
    else:
        excluded_positions = set()

    # Check if the result is already fresh
    if not args.forceFlag and not utils.target_needs_rebuild(source_files, consensus_file_path):
        utils.verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        return

    # Load the list of positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    utils.verbose_print("snp position list length = %d" % snplist_length)
    utils.verbose_print("excluded snps list length = %d" % len(excluded_positions))
    utils.verbose_print("total snp position list length = %d" % (snplist_length + len(excluded_positions)))

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(args.minConsFreq,
                                    args.minConsStrdDpth,
                                    args.minConsStrdBias)

    snp_positions = set(snp_list)
    if args.vcfAllPos:
        parse_positions = None
    else:
        parse_positions = snp_positions.union(excluded_positions)
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  args.minBaseQual,
                                  parse_positions)
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, args.vcfPreserveRefCase)
        filters = caller.get_filter_descriptions()
        # TODO: it would be better if the exclude file contained filter headers we could read and re-use here instead of hard-coding this
        filters.append(("Region", "Position is in dense region of snps or near the end of the contig."))
        writer.write_header(sample_name, filters, args.vcfRefName)
    for pileup_record in pileup_reader:
        chrom = pileup_record.chrom
        pos = pileup_record.position
        consensus_base, fail_reasons = caller.call_consensus(pileup_record)
        if (chrom, pos) in excluded_positions:
            # TODO: it would be better if the exclude file contained filter reasons we could re-use here instead of hard coding this
            fail_reasons = fail_reasons or []
            fail_reasons.append("Region")
        if (chrom, pos) in snp_positions:
            if fail_reasons:
                position_consensus_base_dict[(chrom, pos)] = '-'
            else:
                position_consensus_base_dict[(chrom, pos)] = consensus_base

        if vcf_file_name:
            writer.write_from_pileup(pileup_record, fail_reasons, args.vcfFailedSnpGt)
    if vcf_file_name:
        writer.close()

    utils.verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")
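
The per-position decision is delegated to pileup.ConsensusCaller, which is configured with minConsFreq, minConsStrdDpth, and minConsStrdBias. As a rough illustration of just the minConsFreq rule described in the parameter list (not the package's actual implementation, which also applies the strand-depth and strand-bias filters), a single call might reduce to:

from collections import Counter

def call_consensus_base(bases, min_cons_freq=0.6):
    # Hypothetical sketch: choose the most common high-quality base and require
    # it to account for at least min_cons_freq of the reads; otherwise no call.
    if not bases:
        return '-'
    base, count = Counter(bases).most_common(1)[0]
    return base if count / len(bases) >= min_cons_freq else '-'

print(call_consensus_base(list("AAAAAT")))  # 'A'  (5 of 6 reads agree)
print(call_consensus_base(list("AATTGC")))  # '-'  (no base reaches 60%)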
Example 4
def calculate_snp_distances(args):
    """Calculate pairwise sample SNP distances.

    Calculate pairwise SNP distances from the multi-fasta SNP matrix.
    Generate a file of pairwise distances and a file containing a matrix
    of distances.
    This function expects, or creates '(*)', the following files:
            snpma.fasta
            snp_distance_pairwise.tsv*
            snp_distance_matrix.tsv*

    The files are used as follows:
        1. The snpma.fasta input file contains the snp matrix for all samples.
        2. The snp_distance_pairwise.tsv output file contains a three-column
           tab-separated table of distances between all pairs of samples.
        3. The snp_distance_matrix.tsv output file contains a matrix of
           distances between all samples.

    Parameters
    ----------
    args : Namespace
        inputFile: File path (not just file name) for the snp matrix in fasta format
        pairwiseFile: File path (not just file name) of the output pairwise distance file
        matrixFile: File path (not just file name) for the output distance matrix file

    Raises:

    Examples:
    args = argparse.Namespace()
    args.inputFile = 'snpma.fasta'
    args.pairwiseFile = 'snp_distance_pairwise.tsv'
    args.matrixFile = 'snp_distance_matrix.tsv'
    calculate_snp_distances(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate arguments
    #==========================================================================
    input_file = args.inputFile
    pairwise_file = args.pairwiseFile
    matrix_file = args.matrixFile
    force_flag = args.forceFlag

    bad_file_count = utils.verify_existing_input_files("SNP matrix file",
                                                       [input_file])
    if bad_file_count > 0:
        utils.global_error(
            "Error: cannot calculate sequence distances without the snp matrix file."
        )

    if not pairwise_file and not matrix_file:
        utils.global_error("Error: no output file specified.")

    #==========================================================================
    # Check freshness
    #==========================================================================
    rebuild_pairwise_file = pairwise_file and utils.target_needs_rebuild(
        [input_file], pairwise_file)
    rebuild_matrix_file = matrix_file and utils.target_needs_rebuild(
        [input_file], matrix_file)
    if force_flag or rebuild_pairwise_file or rebuild_matrix_file:

        #------------------------------
        # Read in snp matrix file
        #------------------------------
        seqs = {}
        with open(input_file) as ifile:
            for line in ifile:
                line = line.rstrip('\n')
                if line.startswith('>'):
                    curr_sample = line.lstrip('>')
                    seqs[curr_sample] = ''
                else:
                    seqs[curr_sample] += str(line)

        #------------------------------
        # Count mismatches
        #------------------------------
        verbose_print(
            "# %s %s" %
            (utils.timestamp(), "Calculating all pairwise distances"))
        ids = sorted(seqs.keys())
        pairwise_mismatches = dict()  # tuple (seq1 id, seq2 id) -> int

        for id1, id2 in itertools.combinations(ids, 2):
            mismatches = utils.calculate_sequence_distance(
                seqs[id1], seqs[id2])
            pairwise_mismatches[(id1, id2)] = mismatches
            pairwise_mismatches[(id2, id1)] = mismatches

        #------------------------------
        # Print distance files
        #------------------------------
        if pairwise_file:
            with open(pairwise_file, 'w') as p_out:
                p_out.write('%s\n' % '\t'.join(['Seq1', 'Seq2', 'Distance']))
                for id1, id2 in itertools.product(ids, ids):
                    mismatches = pairwise_mismatches.get(
                        (id1, id2), 0)  # zero when id1=id2
                    p_out.write("%s\t%s\t%i\n" % (id1, id2, mismatches))

        if matrix_file:
            with open(matrix_file, 'w') as m_out:
                m_out.write('\t%s\n' % '\t'.join(ids))  # matrix header
                # write table of mismatches
                for id1 in ids:
                    mismatches = [
                        pairwise_mismatches.get((id1, id2), 0) for id2 in ids
                    ]
                    mismatch_strs = map(str, mismatches)
                    m_out.write("%s\t%s\n" % (id1, '\t'.join(mismatch_strs)))

    else:
        utils.verbose_print(
            "Distance files have already been freshly built.  Use the -f option to force a rebuild."
        )
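
The mismatch count between two samples comes from utils.calculate_sequence_distance. A minimal sketch under the assumption that it counts differing positions while skipping gap or ambiguous characters (the real helper may define the distance differently) is:

def sequence_distance(seq1, seq2, ignore_chars="-N"):
    # Hypothetical sketch: count mismatching aligned positions, skipping any
    # position where either sequence has a gap or ambiguous character.
    return sum(1 for a, b in zip(seq1, seq2)
               if a != b and a not in ignore_chars and b not in ignore_chars)

print(sequence_distance("ACGT-A", "ACTTGA"))  # 1 -- only the G/T mismatch counts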
Example 5
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get(
            "SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print(
            "# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file],
                                               sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get(
            "SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher.
        # Compare the version numerically; a plain string comparison would
        # mis-order versions such as "1.10" and "1.3".
        samtools_version = version_str.split()[-1]  # just the number
        samtools_version_key = tuple(int(x) for x in samtools_version.split('.') if x.isdigit())
        if samtools_version_key < (1, 3):
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(
                sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file],
                                                   deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error(
                    "Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable."
                )
            else:
                version_str = utils.extract_version_str(
                    "Picard",
                    "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1"
                )
                picard_jvm_extra_params = os.environ.get(
                    "PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get(
                    "PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(
                    sample_dir, "duplicate_reads_metrics.txt"
                ) + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file,
                                                   "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild(
        [pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get(
            "SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error(
                "Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable."
            )
        else:
            version_str = utils.extract_version_str(
                "VarScan",
                "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2"
            )
            varscan_jvm_extra_params = os.environ.get(
                "VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get(
                "VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError",
                                                "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient",
                                                "VarScan")
Example 6
def create_snp_reference_seq(options_dict):
    """Write reference sequence bases at SNP locations to a fasta file.

    Description:
    Write reference sequence bases at SNP locations to a fasta file.
    This function expects, or creates '(*)', the following files:
            reference.fasta
            snplist.txt
            referenceSNP.fasta (*)

    The files are used as follows:
        1. The reference.fasta input file contains the whole-genome reference
           bases.
        2. The snplist.txt input file contains the list of SNP positions across
           all the samples.
        2. The referenceSNP.fasta output file contains the reference bases at
           the identified SNP locations.

    The snplist.txt file is created outside of this function.  The package
        documentation provides an example of creating this file based on the
        lambda_virus sequence that is used as one test for this package.

    Args:
        referenceFile: File path (not just file name) for reference sequence
            (in fasta format).
        snpListFile: File path (not just file name) of text format list of SNP
            positions
        snpRefFile: File path (not just file name) for the SNP reference
            sequence file.

    Raises:

    Examples:
    options_dict = {'referenceFile':'reference.fasta',
                    'snpListFile':'snplist.txt',
                    'snpRefFile':'referenceSNP.fasta'
                   }
    create_snp_reference_seq(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    #    Write reference sequence bases at SNP locations to a fasta file.
    #==========================================================================
    reference_file = options_dict['referenceFile']
    snp_list_file_path = options_dict['snpListFile']
    snp_ref_seq_path = options_dict['snpRefFile']

    #==========================================================================
    # Verify input files exist
    #==========================================================================
    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [reference_file])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the reference fasta file.")

    #==========================================================================
    # Find the reference bases at the snp positions
    #==========================================================================
    source_files = [reference_file, snp_list_file_path]
    if options_dict['forceFlag'] or utils.target_needs_rebuild(source_files, snp_ref_seq_path):
        utils.write_reference_snp_file(reference_file, snp_list_file_path, snp_ref_seq_path)
        verbose_print("")
    else:
        verbose_print("SNP reference sequence %s has already been freshly built.  Use the -f option to force a rebuild." % snp_ref_seq_path)

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
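
The actual extraction is done by utils.write_reference_snp_file. A rough Biopython sketch of the same idea, assuming snplist.txt holds whitespace-delimited contig-name/position pairs with 1-based positions (the real file format and helper implementation may differ), could be:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def write_reference_snp_fasta(reference_file, snp_list_file, out_file):
    # Hypothetical sketch: emit the reference base found at each SNP position.
    contigs = {rec.id: rec.seq for rec in SeqIO.parse(reference_file, "fasta")}
    bases = []
    with open(snp_list_file) as snp_list:
        for line in snp_list:
            fields = line.split()
            if len(fields) >= 2:
                contig, pos = fields[0], int(fields[1])
                bases.append(str(contigs[contig][pos - 1]))
    record = SeqRecord(Seq(''.join(bases)), id="referenceSNP", description="")
    SeqIO.write([record], out_file, "fasta")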
Example 7
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list
            of SNP positions

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = dict()
        excluded_sample_directories = set()
        for sample_dir, vcf_file_path in zip(sorted_list_of_sample_directories, list_of_vcf_files):

            if not os.path.isfile(vcf_file_path):
                continue
            if os.path.getsize(vcf_file_path) == 0:
                continue

            verbose_print("Processing VCF file %s" % vcf_file_path)
            sample_name = os.path.basename(os.path.dirname(vcf_file_path))
            snp_set = utils.convert_vcf_file_to_snp_set(vcf_file_path)
            max_snps = options_dict['maxSnps']
            if max_snps >= 0 and len(snp_set) > max_snps:
                verbose_print("Excluding sample %s having %d snps." % (sample_name, len(snp_set)))
                excluded_sample_directories.add(sample_dir)
                continue

            for key in snp_set:
                if key not in snp_dict:
                    sample_list = [sample_name]
                    snp_dict[key] = sample_list
                else:
                    sample_list = snp_dict[key]
                    sample_list.append(sample_name)

        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")

        #==========================================================================
        # Write the filtered list of sample directories
        #==========================================================================
        sample_directories_list_path = sample_directories_list_path + ".filtered"
        with open(sample_directories_list_path, "w") as filtered_samples_file_object:
            # Loop over the unsorted list to keep the order of samples the same as the original.
            # This will keep the same HPC log file suffix number.
            for sample_dir in unsorted_list_of_sample_directories:
                if sample_dir not in excluded_sample_directories:
                    filtered_samples_file_object.write("%s\n" % sample_dir)
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
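
The per-sample positions come from utils.convert_vcf_file_to_snp_set. A minimal sketch of building such a set with the same PyVCF reader used in the filter_regions example (the real helper may apply additional filtering) is:

import vcf

def vcf_file_to_snp_set(vcf_file_path):
    # Hypothetical sketch: collect a (CHROM, POS) tuple for every record in the VCF.
    with open(vcf_file_path) as vcf_handle:
        return set((record.CHROM, record.POS) for record in vcf.Reader(vcf_handle))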
Example 8
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*

    The reverse fastq file is optional.
    The fastq files may be either compressed with gzip or uncompressed.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file",
                                       fastq_files,
                                       error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[
        0]  # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# %s has already been aligned to %s.  Use the -f option to force a rebuild."
            % (sample_id, reference_id))
        return

    #==========================================================================
    # Construct the command line to execute bowtie2 or smalt
    #==========================================================================

    # The read group identifies reads from a single run and lane
    read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1,
                                                      sample_id)

    # Default to 8 cores on HPC or all cpu cores on workstation
    if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"):
        num_cores = 8
    else:
        num_cores = psutil.cpu_count()

    num_cores_param = ""

    if snp_pipeline_aligner == "bowtie2":
        version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

        # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores
        bowtie2_align_extra_params = os.environ.get(
            "Bowtie2Align_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(
                bowtie2_align_extra_params, "-p"):
            num_cores_param = "-p " + str(num_cores)

        # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
        # The read group tags are used by some downstream tools, like Picard and GATK.
        read_group_params = ""
        if read_group_tags:
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

        # Substitute the default parameters if the user did not specify bowtie parameters
        bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q"

        # Build the command with options depending on whether the fastq files are paired
        command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path
        if sample_fastq_file2:
            command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
        else:
            command_line += " -U " + sample_fastq_file1

    elif snp_pipeline_aligner == "smalt":
        version_str = utils.extract_version_str("smalt", "smalt version")

        # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores
        smalt_align_extra_params = os.environ.get(
            "SmaltAlign_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(
                smalt_align_extra_params, "-n"):
            num_cores_param = "-n " + str(num_cores)

        # Substitute the default parameters if the user did not specify smalt parameters
        smalt_align_params = smalt_align_extra_params or "-O"

        # Don't use the -i 1000 option if the fastq file is unpaired
        if not sample_fastq_file2:
            smalt_align_params = re.sub(
                "-i[ ]+[0-9]+", '',
                smalt_align_params)  # strip the insert-size option with a regex substitute

        command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (
            sample_fastq_file2 or "")

    #==========================================================================
    # Run the command to execute bowtie2 or smalt
    #==========================================================================
    verbose_print("# Align sequence %s to reference %s" %
                  (sample_id, reference_id))
    verbose_print("# %s %s" % (utils.timestamp(), command_line))
    verbose_print("# %s" % version_str)
    command.run(command_line, sam_file)

    #==========================================================================
    # When using smalt, assign read groups in a separate step.
    # This is already done when using bowtie2.
    #==========================================================================
    if snp_pipeline_aligner == "smalt" and read_group_tags:
        smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
        shutil.move(sam_file, smalt_sam_file)
        version_str = utils.extract_version_str(
            "Picard",
            "java  picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1"
        )
        jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
        command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups"
        command_line += " I=" + smalt_sam_file
        command_line += " O=" + sam_file
        command_line += " RGID=" + read_group_tags.ID
        command_line += " RGSM=" + read_group_tags.SM
        command_line += " RGLB=" + read_group_tags.LB
        command_line += " RGPL=" + read_group_tags.PL
        command_line += " RGPU=" + read_group_tags.PU
        verbose_print("")
        verbose_print("# Assign read group id %s" % (read_group_tags.ID))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
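
# Hedged sketch of how a helper like utils.detect_numeric_option_in_parameters_str could
# decide whether the user already supplied a numeric option (e.g. "-p 16") in an
# extra-parameters string.  This is an illustrative assumption, not the pipeline's code.
import re

def detect_numeric_option_sketch(parameters_str, option):
    """Return True if the option followed by a number appears in the parameter string."""
    return re.search(re.escape(option) + r"\s*\d+", parameters_str) is not None

# Example: detect_numeric_option_sketch("--reorder -p 16 -q", "-p")  ->  True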
Example No. 9
0
def merge_vcfs(args):
    """Merge the per-sample VCF files.

    Execute an external program (bcftools merge) to merge the VCF files.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            samples
                sample_name_one/consensus.vcf
            snpma.vcf*

    All the input files are created outside of this function.  Before
    running this command, the vcf file for each sample must be created by the
    call_consensus.py script.

    The package documentation provides an example of preparing these files based
    on the lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one per sample
        vcfFileName : File name of the vcf files which must exist in each of the sample directories
        mergedVcfFile : Path to the output merged multi-vcf file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    sample_directories_list_path = args.sampleDirsFile
    vcf_file_name = args.vcfFileName
    merged_vcf_file = args.mergedVcfFile

    utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path], error_handler="global")

    with open(sample_directories_list_path, "r") as f:
        sample_directories = [line.rstrip() for line in f]
    sample_directories = [d for d in sample_directories if d]
    vcf_files = [os.path.join(d, vcf_file_name) for d in sample_directories]

    good_vcf_files = []
    for vcf_file in vcf_files:
        bad = utils.verify_non_empty_input_files("Sample vcf file", [vcf_file], error_handler="sample", continue_possible=True)
        if not bad:
            good_vcf_files.append(vcf_file)

    if len(good_vcf_files) == 0:
        utils.global_error("There are no vcf files to merge.")

    #==========================================================================
    # Check if merge has already been done
    #==========================================================================
    needs_rebuild = utils.target_needs_rebuild(vcf_files, merged_vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Multi-VCF file is already freshly created.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Copy, Compress, Index, Merge
    #==========================================================================

    # If there is only one good sample, just copy the consensus VCF file to the snpma.vcf file
    if len(good_vcf_files) == 1:
        shutil.copy(good_vcf_files[0], merged_vcf_file)
        return

    # Copy single VCF files to a common directory where the files will be edited
    verbose_print("# %s Copying VCF files to temp directory" % utils.timestamp())
    parent_of_temp_dir = os.path.dirname(merged_vcf_file)
    temp_dir = tempfile.mkdtemp(prefix="tmp.vcf.", dir=parent_of_temp_dir)
    file_copies = []
    for d in sample_directories:
        src_file = os.path.join(d, vcf_file_name)
        if src_file in good_vcf_files:
            dst_file = os.path.join(temp_dir, os.path.basename(d) + ".vcf")
            file_copies.append(dst_file)
            verbose_print("copy %s %s" % (src_file, dst_file))
            #if not os.path.isfile(dst_file) or os.stat(src_file).st_mtime > os.stat(dst_file).st_mtime:
            shutil.copy2(src_file, dst_file)

    # bgzip all the sample vcf files
    verbose_print("# %s Compressing VCF files" % utils.timestamp())
    for file in file_copies:
        verbose_print("bgzip -c %s > %s" % (file, file + ".gz"))
        command.run("bgzip -c " + file, file + ".gz")

    # Index all the zipped sample vcf files
    verbose_print("# %s Indexing VCF files" % utils.timestamp())
    for file in file_copies:
        file += ".gz"
        verbose_print("tabix -f -p vcf " + file)
        command.run("tabix -f -p vcf " + file, sys.stdout)

    # Substitute the default parameters if the user did not specify bcftools parameters
    default_params = "--merge all --info-rules NS:sum"
    bcf_tools_extra_params = os.environ.get("BcftoolsMerge_ExtraParams") or default_params

    # Merge the VCFs
    verbose_print("# %s Merging VCF files" % utils.timestamp())
    command_line = "bcftools merge -o " + merged_vcf_file + ' ' + bcf_tools_extra_params + ' ' + temp_dir + "/*.gz"
    verbose_print(command_line)
    command.run(command_line, sys.stdout)

    # Clean up
    shutil.rmtree(temp_dir)
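
# Rough, hedged shell-level equivalent of the compress/index/merge steps above, written
# with subprocess for illustration.  Paths are hypothetical; the real pipeline routes
# these commands through command.run() and honors BcftoolsMerge_ExtraParams.
import glob
import subprocess

def merge_vcfs_sketch(temp_dir, merged_vcf_file):
    for vcf in glob.glob(temp_dir + "/*.vcf"):
        with open(vcf + ".gz", "wb") as gz:
            subprocess.run(["bgzip", "-c", vcf], stdout=gz, check=True)       # compress
        subprocess.run(["tabix", "-f", "-p", "vcf", vcf + ".gz"], check=True)  # index
    gz_files = sorted(glob.glob(temp_dir + "/*.gz"))
    subprocess.run(["bcftools", "merge", "--merge", "all", "--info-rules", "NS:sum",
                    "-o", merged_vcf_file] + gz_files, check=True)             # merge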
Example No. 10
0
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get("SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1] # just the number
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file], deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error("Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable.")
            else:
                version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1")
                picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file, "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
Example No. 11
0
def index_ref(args):
    """Index the reference genome.

    Execute an external program (bowtie2 or smalt) to create an index for the
    reference genome to be used during subsequent alignment.  Execute samtools
    to create the faidx index file to be used during subsequent pileups.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta         # input fasta
                referenceFile.#.bt2*        # bowtie2 output
                referenceFile.rev.#.bt2*    # bowtie2 output
                referenceFile.sma*          # smalt output
                referenceFile.smi*          # smalt output
                referenceFile.fasta.fai*    # samtools faidx output

    The input fasta file is created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    reference_base_path = os.path.splitext(reference_file_path)[
        0]  # strip the file extension

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    # Create index file for reference
    if snp_pipeline_aligner == "bowtie2":
        target_file = reference_base_path + ".rev.1.bt2"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Bowtie index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("bowtie2",
                                                    "bowtie2 --version")
            bowtie2_build_extra_params = os.environ.get(
                "Bowtie2Build_ExtraParams") or ""
            command_line = "bowtie2-build " + bowtie2_build_extra_params + ' ' + reference_file_path + ' ' + reference_base_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)

    elif snp_pipeline_aligner == "smalt":
        target_file = reference_base_path + ".smi"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Smalt index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("smalt", "smalt version")
            smalt_index_extra_params = os.environ.get(
                "SmaltIndex_ExtraParams") or ""
            command_line = "smalt index " + smalt_index_extra_params + ' ' + reference_base_path + ' ' + reference_file_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)

    # Create the samtools fai index
    verbose_print("")
    target_file = reference_file_path + ".fai"
    needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                               target_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# SAMtools fai index %s is already freshly built.  Use the -f option to force a rebuild."
            % target_file)
    else:
        version_str = utils.extract_version_str("samtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_faidx_extra_params = os.environ.get(
            "SamtoolsFaidx_ExtraParams") or ""
        command_line = "samtools faidx " + samtools_faidx_extra_params + ' ' + reference_file_path
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.global_error_on_missing_file(target_file, "samtools faidx")
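
# Hedged usage sketch: invoking index_ref directly with a hand-built Namespace and the
# SnpPipeline_Aligner environment variable.  The reference path is hypothetical; normally
# the cfsan_snp_pipeline front end builds this Namespace from the command line.
import argparse
import os

os.environ["SnpPipeline_Aligner"] = "bowtie2"   # or "smalt"
args = argparse.Namespace(referenceFile="reference/lambda_virus.fasta", forceFlag=False)
# index_ref(args)   # would build the aligner index and the samtools .fai index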
Example No. 12
0
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file, ".deduped", enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=enable_local_realignment)

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([input_bam_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if not jar_file_path:
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
Example No. 13
0
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including contig edges
    and probable prophage regions, and then removes SNPs in these regions from
    the original vcf files of all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf files are the variant input files (i.e., the original vcf file) for each sample.
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        edgeLength: the length of edge of a contig in which SNPs will be removed.
            Default is 500.
        windowSize: the size of the window in which max number of SNPs are allowed.
            Default is 1000.
        maxSNP: the maximum number of SNPs allowed in a window of a size defined in
            windowSize. Default is 3.
        mode:
            all = Dense regions found in any sample are filtered from all samples.
            each = Dense regions found in a sample are filtered from that sample only.

    Raises:

    Examples:
    args = argparse.Namespace
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.mode == "all"

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files(
        "File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path,
              "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [
            line.rstrip() for line in sample_directories_list_file
        ]
    unsorted_list_of_sample_directories = [
        d for d in unsorted_list_of_sample_directories if d
    ]
    sorted_list_of_sample_directories = sorted(
        unsorted_list_of_sample_directories)

    list_of_vcf_files = [
        os.path.join(dir, vcf_file_name)
        for dir in sorted_list_of_sample_directories
    ]
    bad_file_count = utils.verify_non_empty_input_files(
        "VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." %
                           bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." %
                           bad_file_count,
                           continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files(
        "Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files(
            "File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [
                    line.rstrip() for line in out_group_list_file
                ]
            sorted_list_of_outgroup_samples = sorted(
                unsorted_list_of_outgroup_samples)
        except:
            utils.global_error(
                "Error: Cannot open the file containing the list of outgroup samples!"
            )

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(ref_fasta_path, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
    except:
        utils.global_error(
            "Error: cannot open the reference fasta file, or failed to read the contigs in the reference fasta file."
        )
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Filter regions
    #==========================================================================
    if filter_across_samples:
        filter_regions_across_samples(list_of_vcf_files, contig_length_dict,
                                      sorted_list_of_outgroup_samples,
                                      force_flag, edge_length,
                                      window_size_list, max_num_snps_list,
                                      ref_fasta_path, out_group_list_path)
    else:
        filter_regions_per_sample(list_of_vcf_files, contig_length_dict,
                                  sorted_list_of_outgroup_samples, force_flag,
                                  edge_length, window_size_list,
                                  max_num_snps_list, ref_fasta_path,
                                  out_group_list_path)
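
# Hedged sketch of the dense-SNP window idea described in the docstring: flag any window
# of windowSize bases containing more than maxSNP positions.  This illustrates the concept
# only; the pipeline's filter_regions_* helpers implement the real logic.
def find_dense_regions_sketch(sorted_positions, window_size, max_snps):
    """Yield (start, end) windows containing more than max_snps SNP positions."""
    for i, start in enumerate(sorted_positions):
        window = [p for p in sorted_positions[i:] if p < start + window_size]
        if len(window) > max_snps:
            yield (start, start + window_size - 1)

# Example: list(find_dense_regions_sketch([10, 400, 450, 480, 5000], 1000, 3))
#          ->  [(10, 1009)]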
Example No. 14
0
def create_snp_reference_seq(args):
    """Write reference sequence bases at SNP locations to a fasta file.

    Write reference sequence bases at SNP locations to a fasta file.
    This function expects, or creates '(*)', the following files:
            reference.fasta
            snplist.txt
            referenceSNP.fasta (*)

    The files are used as follows:
        1. The reference.fasta input file contains the whole-genome reference
           bases.
        2. The snplist.txt input file contains the list of SNP positions across
           all the samples.
        3. The referenceSNP.fasta output file contains the reference bases at
           the identified SNP locations.

    The snplist.txt file is created outside of this function.  The package
        documentation provides an example of creating this file based on the
        lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : Namespace
        referenceFile: File path (not just file name) for reference sequence in fasta format
        snpListFile: File path (not just file name) of text format list of SNP positions
        snpRefFile: File path (not just file name) for the SNP reference sequence file.

    Raises:

    Examples:
    args = argparse.Namespace
    args.referenceFile = 'reference.fasta'
    args.snpListFile = 'snplist.txt'
    args.snpRefFile = 'referenceSNP.fasta'
    create_snp_reference_seq(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    #    Write reference sequence bases at SNP locations to a fasta file.
    #==========================================================================
    reference_file = args.referenceFile
    snp_list_file_path = args.snpListFile
    snp_ref_seq_path = args.snpRefFile

    #==========================================================================
    # Verify input files exist
    #==========================================================================
    bad_file_count = utils.verify_existing_input_files("Snplist file",
                                                       [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error(
            "Error: cannot create the snp reference sequence without the snplist file."
        )

    bad_file_count = utils.verify_non_empty_input_files(
        "Reference file", [reference_file])
    if bad_file_count > 0:
        utils.global_error(
            "Error: cannot create the snp reference sequence without the reference fasta file."
        )

    #==========================================================================
    # Find the reference bases at the snp positions
    #==========================================================================
    source_files = [reference_file, snp_list_file_path]
    if args.forceFlag or utils.target_needs_rebuild(source_files,
                                                    snp_ref_seq_path):
        utils.write_reference_snp_file(reference_file, snp_list_file_path,
                                       snp_ref_seq_path)
    else:
        verbose_print(
            "SNP reference sequence %s has already been freshly built.  Use the -f option to force a rebuild."
            % snp_ref_seq_path)
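
# Hedged sketch of the idea behind utils.write_reference_snp_file: look up the reference
# base at each SNP position and emit them as a single fasta record.  The snp_positions
# argument is assumed here to be a list of (contig id, 1-based position) tuples; this is
# an illustrative assumption, not the pipeline's actual snplist parsing.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def write_reference_snp_sketch(reference_fasta, snp_positions, output_fasta):
    contigs = {rec.id: rec.seq for rec in SeqIO.parse(reference_fasta, "fasta")}
    bases = [str(contigs[chrom][pos - 1]) for chrom, pos in snp_positions]
    record = SeqRecord(Seq("".join(bases)), id="referenceSNP", description="")
    SeqIO.write([record], output_fasta, "fasta")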
Example No. 15
0
def call_consensus(options_dict):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileup at all positions,
           used to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt and reads.all.pileup files are created outside of this function.
    The package documentation provides an example of creating these files based on
    the lambda_virus sequence that is used as one test for this package.

    Args:
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.

    Raises:

    Examples:
    options_dict = {'snpListFile':'snplist.txt',
                    'allPileupFile':'reads.all.pileup',
                    'consensusFile':'consensus.fasta',
                    'minBaseQual':15,
                    'minConsFreq':0.6,
                    'minConsStrdDpth':4,
                    'minConsStrdBias':0.10,
                    'vcfFailedSnpGt':'.'
                   }
    call_consensus(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    snp_list_file_path = options_dict['snpListFile']
    all_pileup_file_path = options_dict['allPileupFile']
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = options_dict['consensusFile']
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = options_dict['vcfFileName']
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    # Check if the result is already fresh
    source_files = [snp_list_file_path, all_pileup_file_path]
    if not options_dict['forceFlag'] and not utils.target_needs_rebuild(source_files, consensus_file_path):
        verbose_print("Consensus call file %s has already been freshly built.  Use the -f option to force a rebuild." % consensus_file_path)
        verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
        return

    # Load the list of positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    verbose_print("snp position list length = %d" % snplist_length)

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(options_dict['minConsFreq'],
                                    options_dict['minConsStrdDpth'],
                                    options_dict['minConsStrdBias'])
    snp_positions = set(snp_list)
    parse_positions = None if options_dict['vcfAllPos'] else snp_positions
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  options_dict['minBaseQual'],
                                  parse_positions)
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, options_dict['vcfPreserveRefCase'])
        filters = caller.get_filter_descriptions()
        writer.write_header(sample_name, filters, options_dict['vcfRefName'])
    for pileup_record in pileup_reader:
        chrom = pileup_record.chrom
        pos = pileup_record.position
        consensus_base, fail_reasons = caller.call_consensus(pileup_record)
        if (chrom, pos) in snp_positions:
            if fail_reasons:
                position_consensus_base_dict[(chrom, pos)] = '-'
            else:
                position_consensus_base_dict[(chrom, pos)] = consensus_base

        if vcf_file_name:
            writer.write_from_pileup(pileup_record, fail_reasons, options_dict['vcfFailedSnpGt'])
    if vcf_file_name:
        writer.close()

    verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Example No. 16
0
def parse_argument_list(argv):
    """Parse command line arguments.

    Parameters
    ----------
    argv : list
        List of command line arguments, usually sys.argv[1:].

    Returns
    -------
    args : Namespace
        Command line arguments are stored as attributes of a Namespace.
    """
    # Create the top-level parser
    description = """The CFSAN SNP Pipeline is a collection of tools using reference-based
                     alignments to call SNPs for a set of samples."""

    # Override the default help width
    formatter_class = lambda prog: RawArgumentDefaultsHelpFormatter(prog, width=HELP_WIDTH)

    parser = HelpParser(description=description)
    parser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparsers = parser.add_subparsers(dest="subparser_name", help=None, metavar="subcommand       ")
    subparsers.required = True

    # -------------------------------------------------------------------------
    # Create the parser for the "run" command
    # -------------------------------------------------------------------------

    def mirror_mode(value):
        # Accept either upper or lowercase
        lvalue = str(value).lower()
        if lvalue not in ["soft", "hard", "copy"]:
            raise argparse.ArgumentTypeError("Invalid mirror mode: " + value)
        return lvalue

    def job_queue_manager(value):
        # Accept either upper or lowercase
        lvalue = str(value).lower()
        if lvalue not in ["grid", "torque"]:
            raise argparse.ArgumentTypeError("Only the grid and torque job queue managers are currently supported.")
        return lvalue

    description = """Run the SNP Pipeline on a specified data set."""
    subparser = subparsers.add_parser("run", help="This do-it-all script runs all the pipeline steps", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="referenceFile", type=str, help="Relative or absolute path to the reference fasta file")
    subparser.add_argument("-f", "--force", dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-m", "--mirror", dest="mirror", type=mirror_mode, metavar="MODE",
        help="""
            Raw:Create a mirror copy of the reference directory and
                all the sample directories.  Use this option to avoid
                polluting the reference directory and sample
                directories with the intermediate files generated by
                the snp pipeline.  A "reference" subdirectory and a
                "samples" subdirectory are created under the output
                directory (see the -o option).  One directory per
                sample is created under the "samples" directory.
                Three suboptions allow a choice of how the reference
                and samples are mirrored.
                  -m soft : creates soft links to the fasta and fastq
                            files instead of copying
                  -m hard : creates hard links to the fasta and fastq
                            files instead of copying
                  -m copy : copies the fasta and fastq files
               """)

    subparser.add_argument("-c", "--conf", dest="configFile", type=str, metavar="FILE",
        help="""
            Raw:Relative or absolute path to a configuration file for
                overriding defaults and defining extra parameters for
                the tools and scripts within the pipeline.
                Note: A default parameter configuration file named
                      "snppipeline.conf" is used whenever the pipeline
                      is run without the -c option.  The configuration
                      file used for each run is copied into the log
                      directory, capturing the parameters used during
                      the run.""")

    subparser.add_argument("-Q", "--queue_mgr", dest="jobQueueMgr", type=job_queue_manager, metavar="grid|torque",
        help="""Job queue manager for remote parallel job execution in an HPC environment.
                Currently "grid" and "torque" are supported.  If not specified, the pipeline
                will execute locally.""")

    subparser.add_argument("-o", "--out_dir", dest="workDir", type=str, default=".", metavar="DIR",
        help="""Output directory for the result files.
                Additional subdirectories are automatically created under the output
                directory for logs files and the mirrored samples and reference files
                (see the -m option).  The output directory will be created if it does
                not already exist.  If not specified, the output files are written to
                the current working directory.  If you re-run the pipeline on previously
                processed samples, and specify a different output directory, the
                pipeline will not rebuild everything unless you either force a rebuild
                (see the -f option) or you request mirrored inputs (see the -m option).""")

    samples_group = subparser.add_mutually_exclusive_group(required=True)

    samples_group.add_argument("-s", "--samples_dir", dest="samplesDir", type=str, metavar="DIR",
        help="""
            Raw:Relative or absolute path to the parent directory of
                all the sample directories.  The -s option should be
                used when all the sample directories are in
                subdirectories immediately below a parent directory.
                Note: You must specify either the -s or -S option, but
                      not both.
                Note: The specified directory should contain only a
                      collection of sample directories, nothing else.
                Note: Unless you request mirrored inputs, see the
                      -m option, additional files will be written to
                      each of the sample directories during the
                      execution of the SNP Pipeline""")

    samples_group.add_argument("-S", "--samples_file", dest="samplesFile", type=str, metavar="FILE",
        help="""
            Raw:Relative or absolute path to a file listing all of the
                sample directories.  The -S option should be used when
                the samples are not under a common parent directory.
                Note: If you are not mirroring the samples (see the
                      -m option), you can improve parallel processing
                      performance by sorting the list of
                      directories descending by size, largest first.
                      The -m option automatically generates a sorted
                      directory list.
                Note: You must specify either the -s or -S option, but
                      not both.
                Note: Unless you request mirrored inputs (see the
                      -m option), additional files will be written to
                      each of the sample directories during the
                      execution of the SNP Pipeline.""")

    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=run.run)
    subparser.set_defaults(excepthook=run.handle_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "data" command
    # -------------------------------------------------------------------------
    description = """Copy data included with the CFSAN SNP Pipeline to a specified directory."""
    subparser = subparsers.add_parser("data", help="Copy included data to a specified directory", description=description, formatter_class=argparse.RawTextHelpFormatter,
        epilog="""
Example:
# create a new directory "testLambdaVirus" and copy the Lambda virus input data there
$ cfsan_snp_pipeline data lambdaVirusInputs testLambdaVirus
"""
    )

    subparser.add_argument("whichData",
    metavar="whichData",
    choices=["lambdaVirusInputs", "lambdaVirusExpectedResults",
             "agonaInputs", "agonaExpectedResults",
             "listeriaInputs", "listeriaExpectedResults",
             "configurationFile"],
    help="""    Which of the supplied data sets to copy.  The choices are:
        lambdaVirusInputs          : Input reference and fastq files
        lambdaVirusExpectedResults : Expected results files
        agonaInputs                : Input reference file
        agonaExpectedResults       : Expected results files
        listeriaInputs             : Input reference file
        listeriaExpectedResults    : Expected results files
        configurationFile          : File of parameters to customize the
                                     SNP pipeline

    Note: the lambda virus data set is complete with input data and expected
    results.  The agona and listeria data sets have the reference genome and
    the expected results, but not the input fastq files, because the files are
    too large to include with the package.
    """)

    subparser.add_argument("destDirectory",
    nargs="?",
    type=str,
    default=".",
    help="""    Destination directory into which the SNP pipeline data files will be copied.
    The data files are copied into the destination directory if the directory
    already exists.  Otherwise the destination directory is created and the
    data files are copied there.  (default: current directory)""")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=data.copy_data)
    subparser.set_defaults(excepthook=None) # keep default exception handler
    subparser.set_defaults(verbose=0)

    # -------------------------------------------------------------------------
    # Create the parser for the "index_ref" command
    # -------------------------------------------------------------------------
    description = """Index the reference genome for subsequent read mapping, and create
                     the faidx index file for subsequent pileups. The output is written
                     to the reference directory."""
    subparser = subparsers.add_parser("index_ref", help="Index the reference", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="referenceFile",    type=str, help="Relative or absolute path to the reference fasta file")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=index_ref.index_ref)
    subparser.set_defaults(excepthook=utils.handle_global_exception)
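    # Example invocation (hypothetical path for the reference fasta file):
    #   cfsan_snp_pipeline index_ref reference/lambda_virus.fasta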

    # -------------------------------------------------------------------------
    # Create the parser for the "map_reads" command
    # -------------------------------------------------------------------------
    def threads(value):
        ivalue = int(value)
        num_local_cpu_cores = psutil.cpu_count()
        if ivalue < 1 or ivalue > (2 * num_local_cpu_cores):
            raise argparse.ArgumentTypeError("Number of threads must be between %d and %d" % (1, 2 * num_local_cpu_cores))
        return ivalue
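    # For example, on a machine where psutil.cpu_count() reports 4 logical cores,
    # threads("8") returns 8, while threads("9") raises ArgumentTypeError because
    # it exceeds twice the local core count.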

    description = """Align the sequence reads for a specified sample to a specified reference genome.
                     The reads are sorted, duplicates marked, and realigned around indels.
                     The output is written to the file "reads.sorted.deduped.indelrealigned.bam" in the sample directory."""
    subparser = subparsers.add_parser("map_reads", help="Align reads to the reference", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="referenceFile",    type=str, help="Relative or absolute path to the reference fasta file")
    subparser.add_argument(dest="sampleFastqFile1", type=str, help="Relative or absolute path to the fastq file")
    subparser.add_argument(dest="sampleFastqFile2", type=str, help="Optional relative or absolute path to the mate fastq file, if paired", nargs="?")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--threads", dest="threads",   type=threads, default=8, metavar="INT", help="Number of CPU cores to use")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=map_reads.map_reads)
    subparser.set_defaults(excepthook=utils.handle_sample_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "call_sites" command
    # -------------------------------------------------------------------------
    description = "Find the sites with high-confidence SNPs in a sample."
    subparser = subparsers.add_parser("call_sites", help="Find the sites with high-confidence SNPs in a sample", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="referenceFile",    type=str, help="Relative or absolute path to the reference fasta file")
    subparser.add_argument(dest="sampleDir", type=str, help="Relative or absolute directory of the sample")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=call_sites.call_sites)
    subparser.set_defaults(excepthook=utils.handle_sample_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "filter_regions" command
    # -------------------------------------------------------------------------
    description = "Remove abnormally dense SNPs from the input VCF file, save the reserved SNPs into a new VCF file, and save the removed SNPs into another VCF file."
    epilog = 'You can filter snps more than once by specifying multiple window sizes and max snps.  For example "-m 3 2 -w 1000 100" will remove SNPs when more than 3 snps occur in any 1000-base window and also when more than 2 snps occur in any 100-base window.'
    subparser = subparsers.add_parser("filter_regions", help="Remove abnormally dense SNPs from all samples", description=description, formatter_class=formatter_class, epilog=epilog)
    subparser.add_argument(                       dest="sampleDirsFile", type=str,                                                help="Relative or absolute path to file containing a list of directories -- one per sample")
    subparser.add_argument(                       dest="refFastaFile",   type=str,                                                help="Relative or absolute path to the reference fasta file")
    subparser.add_argument("-f", "--force",       dest="forceFlag",      action="store_true",                                     help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-n", "--vcfname",     dest="vcfFileName",    type=str, default="var.flt.vcf", metavar="NAME",         help="File name of the input VCF files which must exist in each of the sample directories")
    subparser.add_argument("-l", "--edge_length", dest="edgeLength",     type=int, default=500,           metavar="EDGE_LENGTH",  help="The length of the edge regions in a contig, in which all SNPs will be removed.")
    subparser.add_argument("-w", "--window_size", dest="windowSizeList", type=int, default=[1000], nargs='*', metavar="WINDOW_SIZE",  help="The length of the window in which the number of SNPs should be no more than max_num_snp.")
    subparser.add_argument("-m", "--max_snp",     dest="maxSnpsList",    type=int, default=[3],    nargs='*', metavar="MAX_NUM_SNPs", help="The maximum number of SNPs allowed in a window.")
    subparser.add_argument("-g", "--out_group",   dest="outGroupFile",   type=str, default=None,          metavar="OUT_GROUP",    help="Relative or absolute path to the file indicating outgroup samples, one sample ID per line.")
    subparser.add_argument("-v", "--verbose",     dest="verbose",        type=int, default=1,             metavar="0..5",         help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=filter_regions.filter_regions)
    subparser.set_defaults(excepthook=utils.handle_global_exception)
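    # Example invocation (hypothetical paths) filtering twice, as described in the
    # epilog above -- regions with more than 3 snps per 1000 bases and regions with
    # more than 2 snps per 100 bases:
    #   cfsan_snp_pipeline filter_regions -m 3 2 -w 1000 100 sampleDirectories.txt reference/lambda_virus.fasta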

    # -------------------------------------------------------------------------
    # Create the parser for the "merge_sites" command
    # -------------------------------------------------------------------------
    description = "Combine the SNP positions across all samples into a single unified SNP list file identifing the positions and sample names where SNPs were called."
    subparser = subparsers.add_parser("merge_sites", help="Prepare the list of sites having SNPs", description=description, formatter_class=formatter_class)
    subparser.add_argument(                   dest="sampleDirsFile", type=str,                        help="Relative or absolute path to file containing a list of directories -- one per sample")
    subparser.add_argument(                   dest="filteredSampleDirsFile", type=str,                help="Relative or absolute path to the output file that will be created containing the filtered list of sample directories -- one per sample.  The samples in this file are those without an excessive number of snps.  See the --maxsnps parameter.")
    subparser.add_argument("-f", "--force",   dest="forceFlag",      action="store_true",             help="Force processing even when result file already exists and is newer than inputs")
    subparser.add_argument("-n", "--vcfname", dest="vcfFileName",    type=str, default="var.flt.vcf", metavar="NAME", help="File name of the VCF files which must exist in each of the sample directories")
    subparser.add_argument(      "--maxsnps", dest="maxSnps",        type=int, default=-1,            metavar="INT",  help="Exclude samples having more than this maximum allowed number of SNPs. Set to -1 to disable this function.")
    subparser.add_argument("-o", "--output",  dest="snpListFile",    type=str, default="snplist.txt", metavar="FILE", help="Output file.  Relative or absolute path to the SNP list file")
    subparser.add_argument("-v", "--verbose", dest="verbose",        type=int, default=1,             metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=merge_sites.merge_sites)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "call_consensus" command
    # -------------------------------------------------------------------------
    def minConsFreq(value):
        fvalue = float(value)
        if fvalue <= 0.5 or fvalue > 1:
            raise argparse.ArgumentTypeError("Minimum consensus frequency must be > 0.5 and <= 1.0")
        return fvalue

    def minConsStrdBias(value):
        fvalue = float(value)
        if fvalue < 0.0 or fvalue > 0.5:
            raise argparse.ArgumentTypeError("Minimum consensus strand bias must be >= 0.0 and <= 0.5")
        return fvalue
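    # For example, minConsFreq("0.6") returns 0.6, while minConsFreq("0.5") raises
    # ArgumentTypeError because the frequency must be strictly greater than 0.5.
    # Likewise, minConsStrdBias("0.25") is accepted, but minConsStrdBias("0.6") is
    # rejected because the strand bias cannot exceed 0.5.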

    description="""Call the consensus base for a sample at the specified positions
                   where high-confidence SNPs were previously called in any of the samples.  Generates
                   a single-sequence fasta file with one base per specified position."""

    help = dict()
    help["allPileupFile"]  = """Relative or absolute path to the genome-wide pileup file for this sample."""
    help["force"]          = """Force processing even when result file already exists and is newer than inputs."""
    help["snpListFile"]    = """Relative or absolute path to the SNP list file across all samples."""
    help["excludeFile"]    = """VCF file of positions to exclude."""
    help["output"]         = """Output file. Relative or absolute path to the consensus fasta file for this sample."""
    help["minBaseQual"]    = """Mimimum base quality score to count a read. All other snp filters take effect after the low-quality reads are discarded."""
    help["minConsFreq"]    = """Consensus frequency. Mimimum fraction of high-quality reads supporting the consensus to make a call."""
    help["minConsDpth"]    = """Consensus depth. Minimum number of high-quality reads supporting the consensus to make a call."""
    help["minConsStrdDpth"]= """Consensus strand depth. Minimum number of high-quality reads supporting the consensus which must be present on both the
                                forward and reverse strands to make a call."""
    help["minConsStrdBias"]= """Strand bias. Minimum fraction of the high-quality consensus-supporting reads which must be present on both the
                                forward and reverse strands to make a call. The numerator of this fraction is the number of high-quality
                                consensus-supporting reads on one strand.  The denominator is the total number of high-quality consensus-supporting
                                reads on both strands combined."""
    help["vcfFileName"]    = """VCF Output file name. If specified, a VCF file with this file name will be created in the same directory as the
                                consensus fasta file for this sample."""
    help["vcfRefName"]     = """Name of the reference file.  This is only used in the generated VCF file header."""
    help["vcfAllPos"]      = """Flag to cause VCF file generation at all positions, not just the snp positions.  This has no effect on
                                the consensus fasta file; it only affects the VCF file.  This capability is intended primarily as a diagnostic tool and
                                enabling this flag will greatly increase execution time."""
    help["vcfPreserveRefCase"] = """Flag to cause the VCF file generator to emit each reference base in uppercase/lowercase as it appears in the reference
                                    sequence file.  If not specified, the reference base is emitted in uppercase."""
    help["vcfFailedSnpGt"] = """Controls the VCF file GT data element when a snp fails filters.  Possible values:
                                .) The GT element will be a dot, indicating unable to make a call.
                                0) The GT element will be 0, indicating the reference base.
                                1) The GT element will be the ALT index of the most commonly occurring base, usually 1."""

    help["verbose"]        = """Verbose message level (0=no info, 5=lots)"""

    subparser = subparsers.add_parser("call_consensus", help="Call the consensus base at high-confidence sites", description=description, formatter_class=formatter_class)
    subparser.add_argument(                              dest="allPileupFile",      type=str,                                                        help=help["allPileupFile"])
    subparser.add_argument("-f", "--force",              dest="forceFlag",          action="store_true",                                             help=help["force"])
    subparser.add_argument("-l", "--snpListFile",        dest="snpListFile",        type=str,            default="snplist.txt",      metavar="FILE", help=help["snpListFile"])
    subparser.add_argument("-e", "--excludeFile",        dest="excludeFile",        type=str,            default=None,               metavar="FILE", help=help["excludeFile"])
    subparser.add_argument("-o", "--output",             dest="consensusFile",      type=str,            default="consensus.fasta",  metavar="FILE", help=help["output"])
    subparser.add_argument("-q", "--minBaseQual",        dest="minBaseQual",        type=int,            default=0,                  metavar="INT",  help=help["minBaseQual"])
    subparser.add_argument("-c", "--minConsFreq",        dest="minConsFreq",        type=minConsFreq,    default=0.60,               metavar="FREQ", help=help["minConsFreq"])
    subparser.add_argument("-D", "--minConsDpth",        dest="minConsDpth",        type=int,            default=1,                  metavar="INT",  help=help["minConsDpth"])
    subparser.add_argument("-d", "--minConsStrdDpth",    dest="minConsStrdDpth",    type=int,            default=0,                  metavar="INT",  help=help["minConsStrdDpth"])
    subparser.add_argument("-b", "--minConsStrdBias",    dest="minConsStrdBias",    type=minConsStrdBias,default=0,                  metavar="FREQ", help=help["minConsStrdBias"])
    subparser.add_argument(      "--vcfFileName",        dest="vcfFileName",        type=str,            default=None,               metavar="NAME", help=help["vcfFileName"])
    subparser.add_argument(      "--vcfRefName",         dest="vcfRefName",         type=str,            default="Unknown reference",metavar="NAME", help=help["vcfRefName"])
    subparser.add_argument(      "--vcfAllPos",          dest="vcfAllPos",          action="store_true",                                             help=help["vcfAllPos"])
    subparser.add_argument(      "--vcfPreserveRefCase", dest="vcfPreserveRefCase", action="store_true",                                             help=help["vcfPreserveRefCase"])
    subparser.add_argument(      "--vcfFailedSnpGt",     dest="vcfFailedSnpGt",     type=str,            default='.',                choices=['.','0','1'], help=help["vcfFailedSnpGt"], )
    subparser.add_argument("-v", "--verbose",            dest="verbose",            type=int,            default=1,                  metavar="0..5", help=help["verbose"])
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=call_consensus.call_consensus)
    subparser.set_defaults(excepthook=utils.handle_sample_exception)
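    # Example invocation (hypothetical pileup file name; the option names are
    # defined above):
    #   cfsan_snp_pipeline call_consensus -l snplist.txt -o consensus.fasta --vcfFileName consensus.vcf samples/sample1/reads.all.pileup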

    # -------------------------------------------------------------------------
    # Create the parser for the "merge_vcfs" command
    # -------------------------------------------------------------------------
    description = "Merge the consensus vcf files from all samples into a single multi-vcf file for all samples."
    subparser = subparsers.add_parser("merge_vcfs", help="Merge the per-sample VCF files", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="sampleDirsFile", type=str, help="Relative or absolute path to file containing a list of directories -- one per sample")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-n", "--vcfname", dest="vcfFileName",   type=str, default="consensus.vcf", metavar="NAME", help="File name of the vcf files which must exist in each of the sample directories")
    subparser.add_argument("-o", "--output",  dest="mergedVcfFile", type=str, default="snpma.vcf",     metavar="FILE", help="Output file.  Relative or absolute path to the merged multi-vcf file")
    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=merge_vcfs.merge_vcfs)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "snp_matrix" command
    # -------------------------------------------------------------------------
    description = """Create the SNP matrix containing the consensus base for each of the samples
                     at the positions where high-confidence SNPs were found in any of the samples.  The matrix
                     contains one row per sample and one column per SNP position.  Non-SNP
                     positions are not included in the matrix.  The matrix is formatted as a fasta
                     file, with each sequence (all of identical length) corresponding to the SNPs
                     in the correspondingly named sample."""
    subparser = subparsers.add_parser("snp_matrix", help="Create a matrix of SNPs", description=description, formatter_class=formatter_class)
    subparser.add_argument(                          dest="sampleDirsFile",     type=str,                                               help="Relative or absolute path to file containing a list of directories -- one per sample")
    subparser.add_argument("-f", "--force",          dest="forceFlag",          action="store_true",                                    help="Force processing even when result file already exists and is newer than inputs")
    subparser.add_argument("-c", "--consFileName",   dest="consFileName",       type=str,   default="consensus.fasta",  metavar="NAME", help="File name of the previously created consensus SNP call file which must exist in each of the sample directories")
    subparser.add_argument("-o", "--output",         dest="snpmaFile",          type=str,   default="snpma.fasta",      metavar="FILE", help="Output file.  Relative or absolute path to the SNP matrix file")
    subparser.add_argument("-v", "--verbose",        dest="verbose",            type=int,   default=1,                  metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=snp_matrix.create_snp_matrix)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "distance" command
    # -------------------------------------------------------------------------
    description = "Calculate pairwise SNP distances from the multi-fasta SNP matrix. Generates a file of pairwise distances and a file containing a matrix of distances."
    subparser = subparsers.add_parser("distance", help="Calculate the SNP distances between samples", description=description, formatter_class=formatter_class)
    subparser.add_argument(                  dest="inputFile",    type=str,                           metavar="snpMatrixFile", help="Relative or absolute path to the input multi-fasta SNP matrix file.")
    subparser.add_argument("-f", "--force",  dest="forceFlag",    action="store_true",                                         help="Force processing even when result file already exists and is newer than inputs")
    subparser.add_argument("-p", "--pairs",  dest="pairwiseFile", type=str, default=None,             metavar="FILE",          help="Relative or absolute path to the pairwise distance output file.")
    subparser.add_argument("-m", "--matrix", dest="matrixFile",   type=str, default=None,             metavar="FILE",          help="Relative or absolute path to the distance matrix output file.")
    subparser.add_argument("-v", "--verbose", dest="verbose",     type=int, default=1,                metavar="0..5",          help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=distance.calculate_snp_distances)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "snp_reference" command
    # -------------------------------------------------------------------------
    description = "Write reference sequence bases at SNP locations to a fasta file."
    subparser = subparsers.add_parser("snp_reference", help="Write reference bases at SNP locations to a fasta file", description=description, formatter_class=formatter_class)
    subparser.add_argument(                          dest="referenceFile", type=str,                                               help="Relative or absolute path to the reference bases file in fasta format")
    subparser.add_argument("-f", "--force",          dest="forceFlag",     action="store_true",                                    help="Force processing even when result file already exists and is newer than inputs")
    subparser.add_argument("-l", "--snpListFile",    dest="snpListFile",   type=str, default="snplist.txt",        metavar="FILE", help="Relative or absolute path to the SNP list file")
    subparser.add_argument("-o", "--output",         dest="snpRefFile",    type=str, default="referenceSNP.fasta", metavar="FILE", help="Output file.  Relative or absolute path to the SNP reference sequence file")
    subparser.add_argument("-v", "--verbose",        dest="verbose",       type=int, default=1,                    metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=snp_reference.create_snp_reference_seq)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "collect_metrics" command
    # -------------------------------------------------------------------------
    description = "Collect alignment, coverage, and variant metrics for a single specified sample."
    subparser = subparsers.add_parser("collect_metrics", help="Collect quality and SNP metrics for a sample", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="sampleDir", type=str, help="Relative or absolute directory of the sample")
    subparser.add_argument(dest="referenceFile",    type=str, help="Relative or absolute path to the reference fasta file")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-o", "--output",  dest="metricsFile",            type=str, default="metrics",                   metavar="FILE", help="Output file.  Relative or absolute path to the metrics file")
    subparser.add_argument("-m", "--maxsnps", dest="maxSnps",                type=int, default=-1,                          metavar="INT",  help="Maximum allowed number of SNPs per sample")
    subparser.add_argument("-c", dest="consensusFastaFileName",              type=str, default="consensus.fasta",           metavar="NAME", help="File name of the consensus fasta file which must exist in the sample directory")
    subparser.add_argument("-C", dest="consensusPreservedFastaFileName",     type=str, default="consensus_preserved.fasta", metavar="NAME", help="File name of the consensus preserved fasta file which must exist in the sample directory")
    subparser.add_argument("-v", dest="consensusVcfFileName",                type=str, default="consensus.vcf",             metavar="NAME", help="File name of the consensus vcf file which must exist in the sample directory")
    subparser.add_argument("-V", dest="consensusPreservedVcfFileName",       type=str, default="consensus_preserved.vcf",   metavar="NAME", help="File name of the consensus preserved vcf file which must exist in the sample directory")
    subparser.add_argument("--verbose", dest="verbose",                      type=int, default=1,                           metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=collect_metrics.collect_metrics)
    subparser.set_defaults(excepthook=utils.handle_sample_exception)

    # -------------------------------------------------------------------------
    # Create the parser for the "combine_metrics" command
    # -------------------------------------------------------------------------
    description = """Combine the metrics from all samples into a single table of metrics for all samples.
                     The output is a tab-separated-values file with a row for each sample and a column
                     for each metric.

                     Before running this command, the metrics for each sample must be created with the
                     collect_metrics command."""

    subparser = subparsers.add_parser("combine_metrics", help="Merge the per-sample metrics", description=description, formatter_class=formatter_class)
    subparser.add_argument(dest="sampleDirsFile", type=str, help="Relative or absolute path to file containing a list of directories -- one per sample")
    subparser.add_argument("-f", "--force",   dest="forceFlag", action="store_true", help="Force processing even when result files already exist and are newer than inputs")
    subparser.add_argument("-n", "--metrics", dest="metricsFileName", type=str, default="metrics", metavar="NAME", help="File name of the metrics files which must exist in each of the sample directories.")
    subparser.add_argument("-o", "--output",  dest="mergedMetricsFile", type=str, default="metrics.tsv", metavar="FILE", help="Output file. Relative or absolute path to the combined metrics file.")
    subparser.add_argument("-s", "--spaces",  dest="spaceHeadings", action="store_true", help="Emit column headings with spaces instead of underscores")
    subparser.add_argument("-v", "--verbose", dest="verbose",   type=int, default=1, metavar="0..5", help="Verbose message level (0=no info, 5=lots)")
    subparser.add_argument("--version", action="version", version="%(prog)s version " + __version__)
    subparser.set_defaults(func=combine_metrics.combine_metrics)
    subparser.set_defaults(excepthook=utils.handle_global_exception)

    # -------------------------------------------------------------------------
    # parse the args
    # -------------------------------------------------------------------------
    args = parser.parse_args(argv)

    # Special validation
    if args.subparser_name == "filter_regions":
        if len(args.windowSizeList) != len(args.maxSnpsList):
            utils.global_error("Error: you must specify the same number of arguments for window size and max snps.")

        for window_size in args.windowSizeList:
            if window_size < 1:
                utils.global_error("Error: the length of the window must be a positive integer, and the input is %d." % window_size)

        for max_snps in args.maxSnpsList:
            if max_snps < 1:
                utils.global_error("Error: the maximum number of SNPs allowed must be a positive integer, and the input is %d." % max_snps)

        if args.edgeLength < 1:
            utils.global_error("Error: the length of the edge regions must be a positive integer, and the input is %d." % args.edgeLength)

    return args
Example No. 17
def create_snp_matrix(options_dict):
    """Create SNP matrix

    Description:
    Create the SNP matrix containing the consensus base for each of the samples
    at the positions where SNPs were found in any of the samples.  The matrix
    contains one row per sample and one column per SNP position.  Non-SNP
    positions are not included in the matrix.
    This function expects, or creates '(*)', the following
        files arranged in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/consensus.fasta
                ...
            snpma.fasta (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The consensus.fasta input files contain the previously called
           consensus bases for each sample and are used to construct the
           SNP matrix fasta file.
        3. The snpma.fasta output file contains the SNP calls for each
           sequence, arranged as a multi-fasta file with one sequence per
           sample.

    The sampleDirectories.txt and consensus.fasta files are created outside of this
        function. The package documentation provides an example of creating
        these files based on the lambda_virus sequence that is used as one
        test for this package.

    Args:
        sampleDirsFile : str
            File path (not just file name) of file containing paths
            to directories containing consensus.fasta file for each sequence.
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        consFileName : str
            File name of the previously called consensus fasta files which must
            exist in each of the sample directories
        snpmaFile : str
            File path (not just file name) of the output snp matrix, formatted
            as a fasta file, with each sequence (all of identical length)
            corresponding to the SNPs in the correspondingly named sample.

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'consFileName':'consensus.fasta',
                    'snpmaFile':'snpma.fasta',
                    'minConsFreq':0.6,
                   }
    create_snp_matrix(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_filename = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_filename, "r") as sample_directories_list_file:
        list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    list_of_sample_directories = sorted([d for d in list_of_sample_directories if d])

    #==========================================================================
    # Verify input consensus.fasta files exist
    #==========================================================================
    consensus_files = []
    bad_file_count = 0
    for sample_directory in list_of_sample_directories:
        consensus_file_path = os.path.join(sample_directory, options_dict['consFileName'])
        bad_count = utils.verify_non_empty_input_files("Consensus fasta file", [consensus_file_path])
        if bad_count == 1:
            bad_file_count += 1
        else:
            consensus_files.append(consensus_file_path)  # keep the list of good files

    if bad_file_count == len(list_of_sample_directories):
        utils.global_error("Error: all %d consensus fasta files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d consensus fasta files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Check if the result is already fresh
    #==========================================================================
    snpma_file_path = options_dict['snpmaFile']
    source_files = consensus_files
    if not options_dict['forceFlag']:
        if not utils.target_needs_rebuild(source_files, snpma_file_path):
            verbose_print("SNP matrix %s has already been freshly built.  Use the -f option to force a rebuild." % snpma_file_path)
            verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
            return

    #==========================================================================
    #   Create snp matrix. Write results to file.
    #==========================================================================
    with open(snpma_file_path, "w") as output_file:
        for consensus_file_path in consensus_files:
            verbose_print("Merging " + consensus_file_path)
            with open(consensus_file_path, "r") as input_file:
                for line in input_file:
                    output_file.write(line)
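    # The resulting snpma.fasta is simply the per-sample consensus fasta files
    # concatenated in order, for example (hypothetical sample names):
    #   >sample1
    #   GATC...
    #   >sample2
    #   GATT...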

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Example No. 18
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position. 
    This function expects, or creates '(*)', the following files arranged 
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to 
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the 
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions 
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of 
    this function. The package documentation provides an example of creating 
    these files based on the lambda_virus sequence that is used as one test 
    for this package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths 
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list 
            of SNP positions

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_filename = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_filename, "r") as sample_directories_list_file:
        list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    list_of_sample_directories = sorted([d for d in list_of_sample_directories if d])

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = utils.convert_vcf_files_to_snp_dict(list_of_vcf_files)
        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")
    else:
        verbose_print("SNP list %s has already been freshly built.  Use the -f option to force a rebuild." % snp_list_file_path)
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Example No. 19
def calculate_snp_distances(options_dict):
    """Calculate pairwise sample SNP distances.

    Description:
    Calculate pairwise SNP distances from the multi-fasta SNP matrix.
    Generate a file of pairwise distances and a file containing a matrix
    of distances.
    This function expects, or creates '(*)', the following files:
            snpma.fasta
            snp_distance_pairwise.tsv*
            snp_distance_matrix.tsv*

    The files are used as follows:
        1. The snpma.fasta input file contains the snp matrix for all samples
        2. The snp_distance_pairwise.tsv output file contains a three column
            tab-separated table of distances between all pairs of samples
        3. The snp_distance_matrix.tsv output file contains a matrix of
           distances between all samples.

    Args:
        inputFile: File path (not just file name) for the snp matrix in fasta format
        pairwiseFile: File path (not just file name) of the output pairwise distance file
        matrixFile: File path (not just file name) for the output distance matrix file

    Raises:

    Examples:
    options_dict = {'inputFile':'snpma.fasta',
                    'pairwiseFile':'snp_distance_pairwise.tsv',
                    'matrixFile':'snp_distance_matrix.tsv'
                   }
    calculate_snp_distances(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Validate arguments
    #==========================================================================
    input_file = options_dict['inputFile']
    pairwise_file = options_dict['pairwiseFile']
    matrix_file = options_dict['matrixFile']
    force_flag = options_dict['forceFlag']

    bad_file_count = utils.verify_existing_input_files("SNP matrix file", [input_file])
    if bad_file_count > 0:
        utils.global_error("Error: cannot calculate sequence distances without the snp matrix file.")

    if not pairwise_file and not matrix_file:
        utils.global_error("Error: no output file specified.")

    #==========================================================================
    # Check freshness
    #==========================================================================
    rebuild_pairwise_file = pairwise_file and utils.target_needs_rebuild([input_file], pairwise_file)
    rebuild_matrix_file = matrix_file and utils.target_needs_rebuild([input_file], matrix_file)
    if force_flag or rebuild_pairwise_file or rebuild_matrix_file:

        #------------------------------
        # Read in snp matrix file
        #------------------------------
        seqs = {}
        with open(input_file) as ifile:
            for line in ifile:
                line = line.rstrip('\n')
                if line.startswith('>'):
                    curr_sample = line.lstrip('>')
                    seqs[curr_sample] = ''
                else:
                    seqs[curr_sample] += str(line)

        #------------------------------
        # Count mismatches
        #------------------------------
        ids = sorted(seqs.keys())
        pairwise_mismatches = dict() # tuple (seq1 id, seq2 id) -> int

        for id1, id2 in itertools.combinations(ids, 2):
            mismatches = utils.calculate_sequence_distance(seqs[id1], seqs[id2])
            pairwise_mismatches[(id1, id2)] = mismatches
            pairwise_mismatches[(id2, id1)] = mismatches
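        # Worked example: for ids ["A", "B", "C"], itertools.combinations yields the
        # pairs (A,B), (A,C), (B,C); each distance is stored under both key orders so
        # the lookups below succeed for (id1, id2) and (id2, id1) alike (this assumes
        # calculate_sequence_distance returns the number of mismatched positions).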

        #------------------------------
        # Print distance files
        #------------------------------
        if pairwise_file:
            with open(pairwise_file, 'w') as p_out:
                p_out.write('%s\n' % '\t'.join(['Seq1', 'Seq2', 'Distance']))
                for id1, id2 in itertools.product(ids, ids):
                    mismatches = pairwise_mismatches.get((id1, id2), 0) # zero when id1=id2
                    p_out.write("%s\t%s\t%i\n" % (id1, id2, mismatches))

        if matrix_file:
            with open(matrix_file, 'w') as m_out:
                m_out.write('\t%s\n' % '\t'.join(ids)) # matrix header
                # write table of mismatches
                for id1 in ids:
                    mismatches = [pairwise_mismatches.get((id1, id2), 0) for id2 in ids]
                    mismatch_strs = map(str, mismatches)
                    m_out.write("%s\t%s\n" % (id1, '\t'.join(mismatch_strs)))

    else:
        verbose_print("Distance files have already been freshly built.  Use the -f option to force a rebuild.")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
Example No. 20
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the contig
    edges and probable prophage regions, and then removes the SNPs in these
    regions from the original vcf files of all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        edgeLength: the length of the edge regions of a contig in which SNPs will be removed.
            Default is 500.
        windowSize: the size of the window in which max number of SNPs are allowed.
            Default is 1000.
        maxSNP: the maximum number of SNPs allowed in a window of a size defined in
            windowSize. Default is 3.
        acrossSamples: Dense regions found in any sample are filtered from all samples.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Get arguments from Argparse namespace
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    ref_fasta_path = args.refFastaFile
    force_flag = args.forceFlag
    vcf_file_name = args.vcfFileName
    edge_length = args.edgeLength
    window_size_list = args.windowSizeList
    max_num_snps_list = args.maxSnpsList
    out_group_list_path = args.outGroupFile
    filter_across_samples = args.acrossSamples

    #==========================================================================
    # Validate inputs
    #==========================================================================
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [ref_fasta_path])
    if bad_file_count > 0:
        utils.global_error(None)

    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            # There are outgroup samples
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(ref_fasta_path, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            # build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
    except:
        utils.global_error("Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Filter regions
    #==========================================================================
    if filter_across_samples:
        filter_regions_across_samples(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
    else:
        filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path)
Example No. 21
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*

    The reverse fastq file is optional.
    The fastq files may be either compressed with gzip or uncompressed.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[0] # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print("# %s has already been aligned to %s.  Use the -f option to force a rebuild." % (sample_id, reference_id))
        return

    #==========================================================================
    # Construct the command line to execute bowtie2 or smalt
    #==========================================================================

    # The read group identifies reads from a single run and lane
    read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id)

    # Default to 8 cores on HPC or all cpu cores on workstation
    if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"):
        num_cores = 8
    else:
        num_cores = psutil.cpu_count()

    num_cores_param = ""

    if snp_pipeline_aligner == "bowtie2":
        version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

        # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores
        bowtie2_align_extra_params = os.environ.get("Bowtie2Align_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(bowtie2_align_extra_params, "-p"):
            num_cores_param = "-p " + str(num_cores)

        # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
        # The read group tags are used by some downstream tools, like Picard and GATK.
        read_group_params = ""
        if read_group_tags:
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

        # Substitute the default parameters if the user did not specify bowtie parameters
        bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q"

        # Build the command with options depending on whether the fastq files are paired
        command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path
        if sample_fastq_file2:
            command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
        else:
            command_line += " -U " + sample_fastq_file1

    elif snp_pipeline_aligner == "smalt":
        version_str = utils.extract_version_str("smalt", "smalt version")

        # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores
        smalt_align_extra_params = os.environ.get("SmaltAlign_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(smalt_align_extra_params, "-n"):
            num_cores_param = "-n " + str(num_cores)

        # Substitute the default parameters if the user did not specify smalt parameters
        smalt_align_params = smalt_align_extra_params or "-O"

        # Don't use the -i 1000 option if the fastq file is unpaired
        if not sample_fastq_file2:
            smalt_align_params = re.sub("-i[ ]+[0-9]+", '', smalt_align_params) # regex substitute

        command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "")

    #==========================================================================
    # Run the command to execute bowtie2 or smalt
    #==========================================================================
    verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id))
    verbose_print("# %s %s" % (utils.timestamp(), command_line))
    verbose_print("# %s" % version_str)
    command.run(command_line, sam_file)

    #==========================================================================
    # When using smalt, assign read groups in a separate step.
    # This is already done when using bowtie2.
    #==========================================================================
    if snp_pipeline_aligner == "smalt" and read_group_tags:
        smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
        shutil.move(sam_file, smalt_sam_file)
        version_str = utils.extract_version_str("Picard", "java  picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1")
        jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
        command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups"
        command_line += " I=" + smalt_sam_file
        command_line += " O=" + sam_file
        command_line += " RGID=" + read_group_tags.ID
        command_line += " RGSM=" + read_group_tags.SM
        command_line += " RGLB=" + read_group_tags.LB
        command_line += " RGPL=" + read_group_tags.PL
        command_line += " RGPU=" + read_group_tags.PU
        verbose_print("")
        verbose_print("# Assign read group id %s" % (read_group_tags.ID))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
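
A minimal usage sketch for this example (not part of the original listing): the attribute names match what the function reads, but the file paths and the aligner choice below are illustrative assumptions.

import argparse
import os

# Select the aligner before calling map_reads; bowtie2 is the default when the variable is unset.
os.environ["SnpPipeline_Aligner"] = "bowtie2"

args = argparse.Namespace()
args.referenceFile = "reference/referenceFile.fasta"                       # assumed path
args.sampleFastqFile1 = "samples/sample_name_one/sampleFastqFile_1.fastq"  # assumed path
args.sampleFastqFile2 = None      # optional reverse reads; None for unpaired data
args.forceFlag = False            # set True to force realignment
map_reads(args)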
Example #22
def create_snp_reference_seq(args):
    """Write reference sequence bases at SNP locations to a fasta file.

    Write reference sequence bases at SNP locations to a fasta file.
    This function expects, or creates '(*)', the following files:
            reference.fasta
            snplist.txt
            referenceSNP.fasta (*)

    The files are used as follows:
        1. The reference.fasta input file contains the whole-genome reference
           bases.
        2. The snplist.txt input file contains the list of SNP positions across
           all the samples.
        3. The referenceSNP.fasta output file contains the reference bases at
           the identified SNP locations.

    The snplist.txt file is created outside of this function.  The package
        documentation provides an example of creating this file based on the
        lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : Namespace
        referenceFile: File path (not just file name) for reference sequence in fasta format
        snpListFile: File path (not just file name) of text format list of SNP positions
        snpRefFile: File path (not just file name) for the SNP reference sequence file.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.referenceFile = 'reference.fasta'
    args.snpListFile = 'snplist.txt'
    args.snpRefFile = 'referenceSNP.fasta'
    create_snp_reference_seq(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    #    Write reference sequence bases at SNP locations to a fasta file.
    #==========================================================================
    reference_file = args.referenceFile
    snp_list_file_path = args.snpListFile
    snp_ref_seq_path = args.snpRefFile

    #==========================================================================
    # Verify input files exist
    #==========================================================================
    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [reference_file])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the reference fasta file.")

    #==========================================================================
    # Find the reference bases at the snp positions
    #==========================================================================
    source_files = [reference_file, snp_list_file_path]
    if args.forceFlag or utils.target_needs_rebuild(source_files, snp_ref_seq_path):
        utils.write_reference_snp_file(reference_file, snp_list_file_path, snp_ref_seq_path)
    else:
        verbose_print("SNP reference sequence %s has already been freshly built.  Use the -f option to force a rebuild." % snp_ref_seq_path)
Example #23
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    SNPs are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads",
                                            "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file,
                                           ".deduped",
                                           enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file,
                                           ".indelrealigned",
                                           enable=enable_local_realignment)

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild(
        [input_bam_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get(
            "SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if not jar_file_path:
            utils.global_error(
                "Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable."
            )
        else:
            version_str = utils.extract_version_str(
                "VarScan", "java -jar " + jar_file_path +
                " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get(
                "VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get(
                "VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError",
                                                "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient",
                                                "VarScan")
Example #24
def filter_regions(args):
    """Remove bad SNPs from original vcf files

    Remove bad SNPs -- this function finds bad regions, including the edges
    and probable prophage regions; then remove SNPs in these regions in
    original vcf files of all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_removed.vcf (*)
                sample_name_one/var.flt_preserved.vcf (*)
                ...

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files (i.e., the original vcf file).
        3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and
           preserved SNPs.

    The sampleDirectories.txt and var.flt.vcf files are created outside of
    this function. The package documentation provides an example of creating
    these files based on the lambda_virus sequence that is used as one test
    for this package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        refFastaFile: File path (not just file name) of reference fasta file
        edgeLength: the length of edge of a contig in which SNPs will be removed.
            Default is 500.
        windowSize: the size of the window in which max number of SNPs are allowed.
            Default is 1000.
        maxSNP: the maximum number of SNPs allowed in a window of a size defined in
            windowSize. Default is 3.

    Raises:

    Examples:
    args = argparse.Namespace()
    args.sampleDirsFile = 'sampleDirectories.txt'
    args.vcfFileName = 'var.flt.vcf'
    args.refFastaFile = 'reference.fasta'
    filter_regions(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate some parameters
    #==========================================================================
    edge_length = args.edgeLength
    window_size = args.windowSize
    max_num_snp = args.maxSNP

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    input_file_list = list()
    out_group_list_path = args.outGroupFile
    sorted_list_of_outgroup_samples = list()
    if out_group_list_path is not None:
        bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path])
        if bad_file_count > 0:
            utils.global_error(None)
        try:
            #There are outgroup samples
            input_file_list.append(out_group_list_path)
            with open(out_group_list_path, "r") as out_group_list_file:
                unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file]
            sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples)
        except:
            utils.global_error("Error: Cannot open the file containing the list of outgroup samples!")

    #==========================================================================
    # Validate inputs
    #==========================================================================
    vcf_file_name = args.vcfFileName
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]
    input_file_list.extend(list_of_vcf_files)

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [args.refFastaFile])
    if bad_file_count > 0:
        utils.global_error(None)

    #==========================================================================
    # Get contigs' length from the reference fasta file
    #==========================================================================
    try:
        handle = open(args.refFastaFile, "r")
        contig_length_dict = dict()
        for record in SeqIO.parse(handle, "fasta"):
            #build contig_length_dict
            contig_length_dict[record.id] = len(record.seq)
        input_file_list.append(args.refFastaFile)
    except:
        utils.global_error("Error: cannot open the reference fasta file, or failed to read the contigs in the reference fasta file.")
    else:
        if handle:
            handle.close()

    #==========================================================================
    # Which samples need rebuild?
    #
    # Any changed or new input file will trigger rebuild for all samples because
    # the bad regions are combined across all samples.  However, a missing
    # output file will only cause rebuild of the missing file.
    #==========================================================================
    need_rebuild_dict = dict()
    for vcf_file_path in list_of_vcf_files:
        preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
        removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"
        preserved_needs_rebuild = utils.target_needs_rebuild(input_file_list, preserved_vcf_file_path)
        removed_needs_rebuild = utils.target_needs_rebuild(input_file_list, removed_vcf_file_path)
        need_rebuild_dict[vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild

    if not any(need_rebuild_dict.values()):
        utils.verbose_print("All preserved and removed vcf files are already freshly built.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Find all bad regions.
    #==========================================================================
    bad_regions_dict = dict() # Key is the contig ID, and the value is a list of bad regions.
    for vcf_file_path in list_of_vcf_files:
        try:
            vcf_reader_handle = open(vcf_file_path, 'r')
            vcf_reader = vcf.Reader(vcf_reader_handle)
        except:
            utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
            continue

        #Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID in sorted_list_of_outgroup_samples:
            if not need_rebuild_dict[vcf_file_path]:
                vcf_reader_handle.close()
                continue
            #Copy the original vcf file to _preserved.vcf, and create an empty _removed.vcf

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                #print "Cannot create the file for removed SNPs: %d." % removed_vcf_file_path
                #close vcf_writer_reserved and remove the file reserved_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                os.remove(removed_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
                continue

            vcf_writer_removed.close()
            vcf_reader_handle.close()
            shutil.copyfile(vcf_file_path, preserved_vcf_file_path)
        else:
            #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF.
            snp_dict = defaultdict(list)
            for vcf_data_line in vcf_reader:
                #Store all SNPs in this sample, keyed by contig.
                #The CHROM should be a contig name in the format of Velvet/SPAdes output.
                record = (vcf_data_line.POS, vcf_data_line)
                snp_dict[vcf_data_line.CHROM].append(record)

            #Find bad regions and add them into bad_region
            for contig, snp_list in snp_dict.items():

                #sort all SNPs in this contig by position
                sorted_list = sorted(snp_list, key=lambda SNPs: SNPs[0])

                #total number of SNPs
                num_of_snp = len(sorted_list)

                if contig not in bad_regions_dict:
                    #New contig
                    try:
                        contig_length = contig_length_dict[contig]
                    except:
                        #cannot find contig length. Use the sys.maxsize.
                        contig_length = sys.maxsize

                    if (contig_length <= (edge_length * 2)):
                        bad_regions_dict[contig] = [(0, contig_length)]
                    else:
                        region = [(0, edge_length), (contig_length - edge_length, contig_length)]
                        bad_regions_dict[contig] = region

                #Process SNPs
                for idx, snp in enumerate(sorted_list):
                    if (idx + max_num_snp) < num_of_snp:
                        pos_start = snp[0]
                        pos_end = sorted_list[idx + max_num_snp][0]
                        if (pos_start + window_size) >= pos_end:
                            #Add bad region
                            regions = bad_regions_dict[contig]
                            temp_region = (pos_start, pos_end)
                            regions.append(temp_region)
        vcf_reader_handle.close()

    #Combine all bad regions for each contig
    for contig, regions in bad_regions_dict.items():
        sorted_regions = utils.sort_coord(regions)
        combined_regions = utils.consensus(sorted_regions)
        bad_regions_dict[contig] = combined_regions

    #Scan vcf files to remove SNPs
    for vcf_file_path in list_of_vcf_files:
        if not need_rebuild_dict[vcf_file_path]:
            continue
        #Get sample ID
        ss = vcf_file_path.split('/')
        sample_ID = ss[-2]

        if sample_ID not in sorted_list_of_outgroup_samples:
            try:
                vcf_reader_handle = open(vcf_file_path, 'r')
                vcf_reader = vcf.Reader(vcf_reader_handle)
            except:
                utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True)
                continue

            #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF.

            preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf"
            removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf"

            try:
                vcf_writer_preserved = None
                vcf_writer_preserved = vcf.Writer(open(preserved_vcf_file_path, 'w'), vcf_reader)
            except:
                if vcf_writer_preserved is not None:
                    vcf_writer_preserved.close()
                os.remove(preserved_vcf_file_path)
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." % preserved_vcf_file_path, continue_possible=True)
                continue

            try:
                vcf_writer_removed = None
                vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader)
            except:
                #close vcf_writer_removed and remove the file removed_vcf_file_path
                if vcf_writer_removed is not None:
                    vcf_writer_removed.close()
                os.remove(removed_vcf_file_path)
                vcf_writer_preserved.close()
                vcf_reader_handle.close()
                utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True)
                continue

            for vcf_data_line in vcf_reader:
                #The CHROM should be a contig name in the format of Velvet/SPAdes output.
                contig = vcf_data_line.CHROM
                if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]):
                    #Remove this SNP
                    vcf_writer_removed.write_record(vcf_data_line)
                else:
                    #Preserve this SNP
                    vcf_writer_preserved.write_record(vcf_data_line)

            vcf_writer_preserved.close()
            vcf_writer_removed.close()
            vcf_reader_handle.close()
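
The dense-SNP rule applied in the loop over sorted_list above can be isolated into a small standalone sketch (an illustration of the same rule, not the pipeline's own code): with maxSNP = 3 and windowSize = 1000, a region is flagged whenever a SNP and the SNP three positions later in sorted order lie within 1000 bases of each other.

def dense_snp_regions(sorted_positions, max_num_snp=3, window_size=1000):
    # Return (start, end) spans where more than max_num_snp SNPs fall inside one window.
    regions = []
    for idx, pos_start in enumerate(sorted_positions):
        if idx + max_num_snp < len(sorted_positions):
            pos_end = sorted_positions[idx + max_num_snp]
            if pos_start + window_size >= pos_end:
                regions.append((pos_start, pos_end))
    return regions

# Four SNPs packed into 300 bases trigger one bad region; the distant SNP at 5000 does not.
print(dense_snp_regions([100, 200, 250, 400, 5000]))  # [(100, 400)]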
Example #25
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file. The sample alignment is sorted, duplicate reads
    are marked, and reads realigned around indels.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.sorted.deduped.bai*
                sample_name_one/realign.target.intervals*
                sample_name_one/reads.sorted.deduped.indelrealigned.bam*

    The fastq files may be either compressed with gzip or uncompressed.

    The reverse fastq file is optional.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file",
                                       fastq_files,
                                       error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[
        0]  # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    num_threads = args.threads

    #==========================================================================
    # verify jar files are in CLASSPATH
    #==========================================================================
    picard_jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
    if not picard_jar_file_path:
        utils.global_error(
            "Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable."
        )
    picard_version_str = utils.extract_version_str(
        "Picard", "java -jar " + picard_jar_file_path +
        " AddOrReplaceReadGroups --version 2>&1")

    gatk_jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK",
                                                      "CLASSPATH")
    if not gatk_jar_file_path:
        utils.global_error(
            "Error: cannot execute GATK. Define the path to GenomeAnalysisTK.jar in the CLASSPATH environment variable."
        )
    gatk_version_str = utils.extract_version_str(
        "GATK", "java -jar " + gatk_jar_file_path + " --version 2>&1")

    #==========================================================================
    # Enforce the proper SAMtools version
    #==========================================================================

    samtools_version_str = utils.extract_version_str(
        "SAMtools", "samtools 2>&1 > /dev/null")
    samtools_version = samtools_version_str.split()[-1]  # just the number
    if samtools_version < "1.4":
        utils.global_error(
            "The installed %s is not supported.  Version 1.4 or higher is required."
            % samtools_version_str)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# %s has already been aligned to %s.  Use the -f option to force a rebuild."
            % (sample_id, reference_id))
    else:
        #==========================================================================
        # Construct the command line to execute bowtie2 or smalt
        #==========================================================================

        # The read group identifies reads from a single run and lane
        read_group_tags = fastq.construct_read_group_tags(
            sample_fastq_file1, sample_id)

        # Make up dummy read group tags if the read group information is missing from the fastq files.
        # GATK components require these tags.
        if read_group_tags is None:
            id = "1"
            sm = sample_id
            lb = "1"
            pl = None
            pu = sample_id
            read_group_tags = fastq.ReadGroupTags(id, sm, lb, pl, pu)

        if snp_pipeline_aligner == "bowtie2":
            version_str = utils.extract_version_str("bowtie2",
                                                    "bowtie2 --version")

            # Substitute the default parameters if the user did not specify bowtie parameters
            os.environ["Bowtie2Align_ExtraParams"] = os.environ.get(
                "Bowtie2Align_ExtraParams") or "--reorder"

            # Set the number of threads to use
            utils.configure_process_threads("Bowtie2Align_ExtraParams", "-p",
                                            num_threads, None)
            bowtie2_align_extra_params = os.environ["Bowtie2Align_ExtraParams"]

            # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
            # The read group tags are used by some downstream tools, like Picard and GATK.
            read_group_params = ""
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            if read_group_tags.PL is not None:
                read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

            # Build the command with options depending on whether the fastq files are paired
            command_line = "bowtie2 " + read_group_params + " " + bowtie2_align_extra_params + " -x " + reference_base_path
            if sample_fastq_file2:
                command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
            else:
                command_line += " -U " + sample_fastq_file1

        elif snp_pipeline_aligner == "smalt":
            version_str = utils.extract_version_str("smalt", "smalt version")

            # Substitute the default parameters if the user did not specify smalt parameters
            os.environ["SmaltAlign_ExtraParams"] = os.environ.get(
                "SmaltAlign_ExtraParams") or "-O"

            # Set the number of threads to use
            utils.configure_process_threads("SmaltAlign_ExtraParams", "-n",
                                            num_threads, None)
            smalt_align_extra_params = os.environ["SmaltAlign_ExtraParams"]

            # Don't use the -i 1000 option if the fastq file is unpaired
            smalt_align_params = smalt_align_extra_params
            if not sample_fastq_file2:
                smalt_align_params = re.sub(
                    "-i[ ]+[0-9]+", '',
                    smalt_align_params)  # regex substitute

            command_line = "smalt map " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (
                sample_fastq_file2 or "")

        #==========================================================================
        # Run the command to execute bowtie2 or smalt
        #==========================================================================
        verbose_print("# Align sequence %s to reference %s" %
                      (sample_id, reference_id))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sam_file)

        #==========================================================================
        # When using smalt, assign read groups in a separate step.
        # This is already done when using bowtie2.
        #==========================================================================
        if snp_pipeline_aligner == "smalt" and read_group_tags:
            smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
            shutil.move(sam_file, smalt_sam_file)
            jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
            command_line = "java " + jvm_params + " -jar " + picard_jar_file_path + " AddOrReplaceReadGroups"
            command_line += " I=" + smalt_sam_file
            command_line += " O=" + sam_file
            command_line += " RGID=" + read_group_tags.ID
            command_line += " RGSM=" + read_group_tags.SM
            command_line += " RGLB=" + read_group_tags.LB
            if read_group_tags.PL is None:
                command_line += " RGPL=unknown"  # Picard requires this command line option
            else:
                command_line += " RGPL=" + read_group_tags.PL
            command_line += " RGPU=" + read_group_tags.PU
            verbose_print("")
            verbose_print("# Assign read group id %s" % (read_group_tags.ID))
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
        verbose_print("")

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        # Substitute the default parameters if the user did not specify samtools view parameters
        os.environ["SamtoolsSamFilter_ExtraParams"] = os.environ.get(
            "SamtoolsSamFilter_ExtraParams") or "-F 4"

        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSamFilter_ExtraParams",
                                        ["-@", "--threads"], num_threads, None)
        samtools_samfilter_params = os.environ["SamtoolsSamFilter_ExtraParams"]

        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print(
            "# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file],
                                               sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSort_ExtraParams",
                                        ["-@", "--threads"], num_threads, None)
        samtools_sort_extra_params = os.environ["SamtoolsSort_ExtraParams"]

        command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file
        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads",
                                            "true").lower() == "true"
    input_file = sorted_bam_file
    output_file = utils.add_file_suffix(input_file,
                                        ".deduped",
                                        enable=remove_duplicate_reads)
    if remove_duplicate_reads:
        # Check for fresh deduped bam file; if not, remove duplicate reads
        needs_rebuild = utils.target_needs_rebuild([input_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            picard_jvm_extra_params = os.environ.get(
                "PicardJvm_ExtraParams") or ""
            picard_mark_duplicates_extra_params = os.environ.get(
                "PicardMarkDuplicates_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + picard_jar_file_path + " MarkDuplicates INPUT=" + input_file + " OUTPUT=" + output_file + " METRICS_FILE=" + os.path.join(
                sample_dir, "duplicate_reads_metrics.txt"
            ) + tmp_option + ' ' + picard_mark_duplicates_extra_params
            verbose_print("# Mark duplicate reads in bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file,
                                               "picard MarkDuplicates")
            verbose_print("")

    #==========================================================================
    # Next three steps are part of local realignment around indels
    #==========================================================================
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"

    #==========================================================================
    # Index the sorted bam file prior to RealignerTargetCreator
    #==========================================================================

    input_file = output_file  # output from last step becomes input to this step
    if enable_local_realignment:
        # Check for fresh bai file; if not, index it
        bam_index_file = input_file[:-3] + "bai"
        needs_rebuild = utils.target_needs_rebuild([input_file],
                                                   bam_index_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Bam file index is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            # Set the number of threads to use
            utils.configure_process_threads("SamtoolsIndex_ExtraParams", "-@",
                                            num_threads, None)
            samtools_index_extra_params = os.environ[
                "SamtoolsIndex_ExtraParams"]

            command_line = "samtools index " + samtools_index_extra_params + ' ' + input_file + ' ' + bam_index_file
            verbose_print("# Index bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % samtools_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(bam_index_file,
                                               "samtools index")
            verbose_print("")

    #==========================================================================
    # Identify targets for realignment
    #==========================================================================

    if enable_local_realignment:
        # Check for fresh realign_targets_file file; if not run RealignerTargetCreator
        realign_targets_file = os.path.join(sample_dir,
                                            "realign.target.intervals")
        needs_rebuild = utils.target_needs_rebuild(
            [input_file, bam_index_file], realign_targets_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Realign targets file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            # Set the number of threads to use
            utils.configure_process_threads(
                "RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"],
                num_threads, None)
            realigner_target_creator_extra_params = os.environ[
                "RealignerTargetCreator_ExtraParams"]

            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T RealignerTargetCreator -R " + reference_file_path + " -I " + input_file + " -o " + realign_targets_file + ' ' + realigner_target_creator_extra_params
            verbose_print("# Identify targets for realignment.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(realign_targets_file,
                                               "GATK RealignerTargetCreator",
                                               empty_ok=True)
            verbose_print("")

    #==========================================================================
    # Realign around indels
    #==========================================================================

    output_file = utils.add_file_suffix(input_file,
                                        ".indelrealigned",
                                        enable=enable_local_realignment)
    if enable_local_realignment:
        # Check for fresh indelrealigned bam file; if not run IndelRealigner
        needs_rebuild = utils.target_needs_rebuild(
            [input_file, bam_index_file, realign_targets_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Indelrealigned bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            indel_realigner_extra_params = os.environ.get(
                "IndelRealigner_ExtraParams") or ""
            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T IndelRealigner -R " + reference_file_path + " -targetIntervals " + realign_targets_file + " -I " + input_file + " -o " + output_file + ' ' + indel_realigner_extra_params
            verbose_print("# Realign around indels")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file,
                                               "GATK IndelRealigner")
Example #26
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file. The sample alignment is sorted, duplicate reads
    are marked, and reads realigned around indels.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.sorted.deduped.bai*
                sample_name_one/realign.target.intervals*
                sample_name_one/reads.sorted.deduped.indelrealigned.bam*

    The fastq files may be either compressed with gzip or uncompressed.

    The reverse fastq file is optional.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[0] # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    num_threads = args.threads

    #==========================================================================
    # verify jar files are in CLASSPATH
    #==========================================================================
    picard_jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
    if not picard_jar_file_path:
        utils.global_error("Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable.")
    picard_version_str = utils.extract_version_str("Picard", "java -jar " + picard_jar_file_path + " AddOrReplaceReadGroups --version 2>&1")

    gatk_jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK", "CLASSPATH")
    if not gatk_jar_file_path:
        utils.global_error("Error: cannot execute GATK. Define the path to GenomeAnalysisTK.jar in the CLASSPATH environment variable.")
    gatk_version_str = utils.extract_version_str("GATK", "java -jar " + gatk_jar_file_path + " --version 2>&1")

    #==========================================================================
    # Enforce the proper SAMtools version
    #==========================================================================

    samtools_version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
    samtools_version = samtools_version_str.split()[-1] # just the number
    if samtools_version < "1.4":
        utils.global_error("The installed %s is not supported.  Version 1.4 or higher is required." % samtools_version_str)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print("# %s has already been aligned to %s.  Use the -f option to force a rebuild." % (sample_id, reference_id))
    else:
        #==========================================================================
        # Construct the command line to execute bowtie2 or smalt
        #==========================================================================

        # The read group identifies reads from a single run and lane
        read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id)

        # Make up dummy read group tags if the read group information is missing from the fastq files.
        # GATK components require these tags.
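        # The standard SAM read group fields are: ID (read group id), SM (sample name),
        # LB (library), PL (sequencing platform, e.g. ILLUMINA), and PU (platform unit).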
        if read_group_tags is None:
            id = "1"
            sm = sample_id
            lb = "1"
            pl = None
            pu = sample_id
            read_group_tags = fastq.ReadGroupTags(id, sm, lb, pl, pu)

        if snp_pipeline_aligner == "bowtie2":
            version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

            # Substitute the default parameters if the user did not specify bowtie parameters
            os.environ["Bowtie2Align_ExtraParams"] = os.environ.get("Bowtie2Align_ExtraParams") or "--reorder"

            # Set the number of threads to use
            utils.configure_process_threads("Bowtie2Align_ExtraParams", "-p", num_threads, None)
            bowtie2_align_extra_params = os.environ["Bowtie2Align_ExtraParams"]

            # Specify the read group and sample tags here; --rg tags cannot be given without --rg-id.
            # The read group tags are used by downstream tools such as Picard and GATK.
            read_group_params = ""
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            if read_group_tags.PL is not None:
                read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

            # Build the command with options depending on whether the fastq files are paired
            command_line = "bowtie2 " + read_group_params + " " + bowtie2_align_extra_params + " -x " + reference_base_path
            if sample_fastq_file2:
                command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
            else:
                command_line += " -U " + sample_fastq_file1

        elif snp_pipeline_aligner == "smalt":
            version_str = utils.extract_version_str("smalt", "smalt version")

            # Substitute the default parameters if the user did not specify smalt parameters
            os.environ["SmaltAlign_ExtraParams"] = os.environ.get("SmaltAlign_ExtraParams") or  "-O"

            # Set the number of threads to use
            utils.configure_process_threads("SmaltAlign_ExtraParams", "-n", num_threads, None)
            smalt_align_extra_params = os.environ["SmaltAlign_ExtraParams"]

            # Don't use the -i 1000 insert-size option if the fastq file is unpaired
            if not sample_fastq_file2:
                smalt_align_extra_params = re.sub("-i[ ]+[0-9]+", '', smalt_align_extra_params) # regex substitute

            command_line = "smalt map " + smalt_align_extra_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "")

        #==========================================================================
        # Run the command to execute bowtie2 or smalt
        #==========================================================================
        verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sam_file)

        #==========================================================================
        # When using smalt, assign read groups in a separate step.
        # This is already done when using bowtie2.
        #==========================================================================
        if snp_pipeline_aligner == "smalt" and read_group_tags:
            smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
            shutil.move(sam_file, smalt_sam_file)
            jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
            command_line = "java " + jvm_params + " -jar " + picard_jar_file_path + " AddOrReplaceReadGroups"
            command_line += " I=" + smalt_sam_file
            command_line += " O=" + sam_file
            command_line += " RGID=" + read_group_tags.ID
            command_line += " RGSM=" + read_group_tags.SM
            command_line += " RGLB=" + read_group_tags.LB
            if read_group_tags.PL is None:
                command_line += " RGPL=unknown"  # Picard requires this command line option
            else:
                command_line += " RGPL=" + read_group_tags.PL
            command_line += " RGPU=" + read_group_tags.PU
            verbose_print("")
            verbose_print("# Assign read group id %s" % (read_group_tags.ID))
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
        verbose_print("")

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        # Substitute the default parameters if the user did not specify samtools view parameters
        os.environ["SamtoolsSamFilter_ExtraParams"] = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"

        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSamFilter_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_samfilter_params = os.environ["SamtoolsSamFilter_ExtraParams"]

        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSort_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_sort_extra_params = os.environ["SamtoolsSort_ExtraParams"]

        command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file
        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
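    # For example, "export RemoveDuplicateReads=false" skips this step, so the sorted bam file
    # from the previous step is used as-is downstream.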
    input_file = sorted_bam_file
    output_file = utils.add_file_suffix(input_file, ".deduped", enable=remove_duplicate_reads)
    if remove_duplicate_reads:
        # Check for fresh deduped bam file; if not, remove duplicate reads
        needs_rebuild = utils.target_needs_rebuild([input_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
            picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + picard_jar_file_path + " MarkDuplicates INPUT=" + input_file + " OUTPUT=" + output_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
            verbose_print("# Mark duplicate reads in bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "picard MarkDuplicates")
            verbose_print("")

    #==========================================================================
    # Next three steps are part of local realignment around indels
    #==========================================================================
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    #==========================================================================
    # Index the sorted bam file prior to RealignerTargetCreator
    #==========================================================================

    input_file = output_file # output from last step becomes input to this step
    if enable_local_realignment:
        # Check for fresh bai file; if not, index it
        bam_index_file = input_file[:-3] + "bai"
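        # The slice swaps the ".bam" suffix for ".bai", e.g. reads.sorted.deduped.bam -> reads.sorted.deduped.bai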
        needs_rebuild = utils.target_needs_rebuild([input_file], bam_index_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Bam file index is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            # Set the number of threads to use
            utils.configure_process_threads("SamtoolsIndex_ExtraParams", "-@", num_threads, None)
            samtools_index_extra_params = os.environ["SamtoolsIndex_ExtraParams"]

            command_line = "samtools index " + samtools_index_extra_params + ' ' + input_file + ' ' + bam_index_file
            verbose_print("# Index bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % samtools_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(bam_index_file, "samtools index")
            verbose_print("")


    #==========================================================================
    # Identify targets for realignment
    #==========================================================================

    if enable_local_realignment:
        # Check for a fresh realign targets file; if not, run RealignerTargetCreator
        realign_targets_file = os.path.join(sample_dir, "realign.target.intervals")
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file], realign_targets_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Realign targets file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            # Set the number of threads to use
            utils.configure_process_threads("RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"], num_threads, None)
            realigner_target_creator_extra_params = os.environ["RealignerTargetCreator_ExtraParams"]

            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T RealignerTargetCreator -R " + reference_file_path + " -I " + input_file + " -o " + realign_targets_file  + ' ' + realigner_target_creator_extra_params
            verbose_print("# Identify targets for realignment.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(realign_targets_file, "GATK RealignerTargetCreator", empty_ok=True)
            verbose_print("")

    #==========================================================================
    # Realign around indels
    #==========================================================================

    output_file = utils.add_file_suffix(input_file, ".indelrealigned", enable=enable_local_realignment)
    if enable_local_realignment:
        # Check for a fresh indelrealigned bam file; if not, run IndelRealigner
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file, realign_targets_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Indelrealigned bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            indel_realigner_extra_params = os.environ.get("IndelRealigner_ExtraParams") or ""
            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T IndelRealigner -R " + reference_file_path + " -targetIntervals " + realign_targets_file + " -I " + input_file + " -o " + output_file  + ' ' + indel_realigner_extra_params
            verbose_print("# Realign around indels")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "GATK IndelRealigner")