コード例 #1
0
import cmdrun

# Smoke test for the cmdrun job runner: queue a handful of shell commands and
# execute them as one batch.
#
# The runner instance is bound to its own name; rebinding the imported module
# name ``cmdrun`` to an instance would hide the module for the rest of the file.
# NOTE(review): the constructor arguments look like (job name, working/log
# directory, backend) -- confirm against the cmdrun module.
runner = cmdrun.cmdrun("test", "log", "sge")
runner.mem = 1
runner.max_parallel = 2
runner.file_timeout = 10

# Each padd() call takes a command template plus two dicts. Judging by the
# placeholders used below, the third dict names files the command produces --
# TODO confirm the exact meaning of both dicts against cmdrun.padd.
runner.padd(["echo asdf1 | true"], {}, {})
# The pipeline ends in `false`, so this command exits non-zero.
runner.padd(["echo asdf2 | false"], {}, {})
runner.padd(["echo asdf3 | true"], {}, {})
runner.padd(["true"], {}, {})
# Creates the file it declares through the %(in1)s placeholder.
runner.padd(["touch %(in1)s"], {}, {'in1': "test1"})
# Declares "test2" but the command never creates it.
# NOTE(review): presumably this exercises file_timeout handling -- confirm.
runner.padd(["true"], {}, {'in2': "test2"})
# Disabled: runner.padd("./testcmdrunner2.pl testinner", {}, {})
runner.prun()
コード例 #2
0
def _exit_with_error(message, status=1):
    """Write ``Error: <message>`` to stderr and exit with *status*."""
    sys.stderr.write("Error: " + message + "\n")
    sys.exit(status)


def _remove_existing_files(filenames):
    """Delete every path in *filenames* that currently exists."""
    for filename in filenames:
        if os.path.exists(filename):
            os.remove(filename)


def _build_merge_command(samtools_bin, output_key, bam_filenames):
    """Build a ``samtools merge`` command template and its input mapping.

    Returns a ``(command, inputs)`` pair: *command* is the token list
    ``[samtools_bin, "merge", "%(<output_key>)s", "%(merge0)s", ...]`` and
    *inputs* maps each ``mergeN`` placeholder name to the matching filename.
    """
    command = [samtools_bin, "merge", "%(" + output_key + ")s"]
    inputs = {}
    for merge_index, bam_filename in enumerate(bam_filenames):
        merge_input_id = "merge%d" % merge_index
        command.append("%(" + merge_input_id + ")s")
        inputs[merge_input_id] = bam_filename
    return command, inputs


def splitalignmerge(reads_end_1_fastq,
                    reads_end_2_fastq,
                    reference_fasta="",
                    output_bam="",
                    working_dir="./",
                    num_job_reads="1000000",
                    aligner=supported_aligners[0],
                    align_type=supported_align_types[0],
                    aligner_bin="",
                    samtools_bin="samtools",
                    fragment_length="500",
                    max_alignments="1",
                    edit_distance="2",
                    split_only=True,
                    remove_split=False):
    """Split paired fastq files, align every piece and merge the BAM files.

    The reads are split by the ``split_fastq.pl`` script found in
    ``sys.path[0]``, which writes a tab-separated catalog with one pair of
    split fastq filenames per row. Unless running in split-only mode, each
    pair is aligned through ``aligner | samtools view | samtools sort``, the
    per-piece BAM files are merged at most 100 at a time into intermediate
    files, and the intermediates are merged into ``output_bam``. Commands are
    executed through a ``cmdrun`` runner created with the "sge" backend.

    Args:
        reads_end_1_fastq: Fastq file holding the first read of each pair.
        reads_end_2_fastq: Fastq file holding the second read of each pair.
        reference_fasta: Reference sequence; ``reference_fasta + ".fai"``
            must exist. Must be given together with ``output_bam``.
        output_bam: Destination of the merged BAM file.
        working_dir: Directory for split, per-piece and intermediate files.
        num_job_reads: Value handed to ``split_fastq.pl`` and embedded in the
            split file prefix.
        aligner: ``"bowtie"`` or ``"mrsfast"``.
        align_type: ``"paired"`` or ``"single"``.
        aligner_bin: Aligner executable; an empty value falls back to the
            aligner name.
        samtools_bin: samtools executable.
        fragment_length: Maximum fragment length passed to the aligner.
        max_alignments: Maximum alignments per read passed to the aligner.
        edit_distance: Edit distance passed to mrsfast.
        split_only: Retained for interface compatibility and not consulted.
            The reads are only split exactly when neither
            ``reference_fasta`` nor ``output_bam`` is given.
        remove_split: Delete the split fastq files and the catalog at the end.

    Returns:
        None.

    Raises:
        SystemExit: with a non-zero status and a message on stderr when the
            arguments are inconsistent, the aligner or alignment type is not
            handled, the fasta index is missing, or the catalog is empty.
    """
    # Needed only for the final move of a single intermediate file.
    import shutil

    # Ensure everything is absolute paths
    reads_end_1_fastq = os.path.abspath(reads_end_1_fastq)
    reads_end_2_fastq = os.path.abspath(reads_end_2_fastq)
    working_dir = os.path.abspath(working_dir)

    # Command templates are lists of strings; accept numeric values as well.
    num_job_reads = str(num_job_reads)
    fragment_length = str(fragment_length)
    max_alignments = str(max_alignments)
    edit_distance = str(edit_distance)

    # Only split if no reference or output specified
    if bool(reference_fasta) != bool(output_bam):
        # Status 2 is the conventional exit status for a usage error.
        _exit_with_error(
            "reference_fasta and output_bam must be given together", 2)
    split_only = not reference_fasta

    reference_fasta_index = ""
    if not split_only:
        if aligner not in ("bowtie", "mrsfast"):
            _exit_with_error("Unsupported aligner " + str(aligner), 2)
        if align_type not in ("paired", "single"):
            _exit_with_error("Unsupported align type " + str(align_type), 2)

        reference_fasta = os.path.abspath(reference_fasta)
        output_bam = os.path.abspath(output_bam)

        # Fasta index is required for samtools. It is checked only when an
        # alignment will actually run, so split-only calls are not rejected.
        reference_fasta_index = reference_fasta + ".fai"
        if not os.path.exists(reference_fasta_index):
            _exit_with_error("Required file " + reference_fasta_index +
                             " does not exist")

        # An empty executable would produce an unrunnable command line.
        aligner_bin = aligner_bin or aligner

    # Use the reads basename and reference basename for uniquely naming files
    reads_name = os.path.basename(reads_end_1_fastq)
    reference_name = os.path.basename(reference_fasta)
    name = reads_name + "." + reference_name

    # Bound to its own name: assigning to ``cmdrun`` here would make it a
    # function local and the module lookup would raise UnboundLocalError.
    runner = cmdrun.cmdrun(name, working_dir, "sge")

    split_prefix = os.path.join(working_dir, reads_name + "." + num_job_reads)
    split_catalog_filename = split_prefix + ".split.catalog"
    split_fastq_script = os.path.join(os.path.abspath(sys.path[0]),
                                      "split_fastq.pl")

    runner.run([
        split_fastq_script, "%(fastq1)s", "%(fastq2)s", num_job_reads,
        split_prefix, "> %(catalog)s"
    ], {
        'fastq1': reads_end_1_fastq,
        'fastq2': reads_end_2_fastq
    }, {'catalog': split_catalog_filename})

    # Each catalog row holds the two split fastq filenames of one piece.
    split_filenames = []
    split_filename_pairs = []
    with open(split_catalog_filename, 'r') as catalog_file:
        for split_fastq_info in csv.reader(catalog_file, delimiter='\t'):
            if not split_fastq_info:
                continue  # blank line
            fastq_end_1, fastq_end_2 = split_fastq_info[0], split_fastq_info[1]
            split_filenames.extend([fastq_end_1, fastq_end_2])
            split_filename_pairs.append((fastq_end_1, fastq_end_2))

    if split_only:
        if remove_split:
            _remove_existing_files(split_filenames)
            os.remove(split_catalog_filename)
        return

    if not split_filename_pairs:
        _exit_with_error("Split catalog " + split_catalog_filename +
                         " lists no read files")

    def build_align_command(bam_sort_prefix):
        """Return the aligner | samtools view | samtools sort template."""
        if aligner == "bowtie":
            align_command = [
                aligner_bin, "--sam-nosq", "-S", "--mm", "-t", "-k",
                max_alignments, "-m", max_alignments, reference_fasta
            ]
            if align_type == "paired":
                align_command.extend([
                    "-X", fragment_length, "-1", "%(fastq1)s", "-2",
                    "%(fastq2)s"
                ])
            else:
                # NOTE(review): --min/--max mirror the mrsfast paired flags;
                # confirm they are intended for single-end bowtie runs.
                align_command.extend(
                    ["--min", "0", "--max", fragment_length, "%(fastq)s"])
        else:
            align_command = [
                aligner_bin, "-e", edit_distance, "-n", max_alignments,
                "--search", reference_fasta
            ]
            if align_type == "paired":
                align_command.extend([
                    "--min", "0", "--max", fragment_length, "-seq1",
                    "%(fastq1)s", "-seq2", "%(fastq2)s"
                ])
            else:
                align_command.extend(["-seq", "%(fastq)s"])

        align_command.extend([
            "|", samtools_bin, "view", "-bt", reference_fasta_index, "-",
            "|", samtools_bin, "sort", "-o", "-", bam_sort_prefix, ">",
            "%(bam)s"
        ])
        return align_command

    # Create bam files
    bam_filenames = []
    for fastq_end_1, fastq_end_2 in split_filename_pairs:
        split_name = os.path.basename(fastq_end_1)
        bam_prefix = os.path.join(working_dir,
                                  split_name + "." + reference_name)

        if align_type == "paired":
            bam_filename = bam_prefix + ".bam"
            runner.padd(build_align_command(bam_prefix + ".sort"), {
                'fastq1': fastq_end_1,
                'fastq2': fastq_end_2
            }, {'bam': bam_filename})
            bam_filenames.append(bam_filename)
        else:
            # One job per read end, each with its own output file and its own
            # sort prefix so that jobs running side by side never share names.
            for end_number, fastq in ((1, fastq_end_1), (2, fastq_end_2)):
                end_prefix = "%s.%d" % (bam_prefix, end_number)
                bam_filename = end_prefix + ".bam"
                runner.padd(build_align_command(end_prefix + ".sort"),
                            {'fastq': fastq}, {'bam': bam_filename})
                bam_filenames.append(bam_filename)

    # Run the alignment commands
    runner.prun()

    # Maximum number of files to merge at once
    merge_max = 100

    # Merge merge_max at a time
    intermediate_bam_filenames = []
    for chunk_start in range(0, len(bam_filenames), merge_max):
        current_merge_bam_filenames = bam_filenames[chunk_start:chunk_start +
                                                    merge_max]

        # Create intermediate bam filename
        intermediate_bam_filename = os.path.join(
            working_dir, "%s.%s.intermediate.%d.bam" %
            (name, reference_name, len(intermediate_bam_filenames)))

        # Merge bam files, or just rename if there is only one
        if len(current_merge_bam_filenames) == 1:
            os.rename(current_merge_bam_filenames[0],
                      intermediate_bam_filename)
        else:
            merge_command, merge_inputs = _build_merge_command(
                samtools_bin, "intermediate", current_merge_bam_filenames)
            runner.padd(merge_command, merge_inputs,
                        {'intermediate': intermediate_bam_filename})

        intermediate_bam_filenames.append(intermediate_bam_filename)

    # Run the merge commands
    runner.prun()

    # Merge the intermediate bam files, or just move if there is only one
    if len(intermediate_bam_filenames) == 1:
        # output_bam may live on another filesystem than working_dir, where a
        # plain os.rename would fail.
        shutil.move(intermediate_bam_filenames[0], output_bam)
    else:
        merge_command, merge_inputs = _build_merge_command(
            samtools_bin, "output", intermediate_bam_filenames)
        runner.run(merge_command, merge_inputs, {'output': output_bam})

    _remove_existing_files(bam_filenames)
    _remove_existing_files(intermediate_bam_filenames)

    if remove_split:
        _remove_existing_files(split_filenames)
        os.remove(split_catalog_filename)