def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align each merged FASTQ sample with bowtie2.

        antecedents unpacks to (fastq_node, sample_node, reference_node).
        For every sample returned by find_merged_fastq_files, one
        bowtie2 command is built that writes <out_path>/<sample>.sam,
        with the command's output redirected (">&") to
        <out_path>/<sample>.log.  The commands are run through
        parallel.pshell, and every SAM file is then checked with
        filelib.assert_exists_nz_many.

        Returns a metadata dict with the keys "tool", "commands" and
        "num_cores".  network, out_attributes and user_options are not
        used by this method.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        filelib.safe_mkdir(out_path)

        metadata = {"tool": "bowtie2 %s" % alignlib.get_bowtie2_version()}

        # One job per sample:
        # (sample, pair1, pair2, sam_filename, log_filename)
        jobs = [
            (sample, pair1, pair2,
             os.path.join(out_path, "%s.sam" % sample),
             os.path.join(out_path, "%s.log" % sample))
            for (sample, pair1, pair2) in fastq_files]

        quote = mlib.sq
        commands = []
        for sample, pair1, pair2, sam_filename, log_filename in jobs:
            # Divide the cores among the jobs, but never go below 1.
            threads = max(1, num_cores / len(jobs))
            cmd = alignlib.make_bowtie2_command(
                ref.fasta_file_full, pair1, fastq_file2=pair2,
                sam_file=sam_filename, num_threads=threads)
            commands.append("%s >& %s" % (cmd, quote(log_filename)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # The run succeeded only if every SAM file exists and is
        # not empty.
        sam_filenames = [job[3] for job in jobs]
        filelib.assert_exists_nz_many(sam_filenames)

        return metadata
    def set_out_attributes(self, antecedents, out_attributes):
        """Return a copy of out_attributes with "orientation" filled in.

        antecedents unpacks to (group_node, fastq_node, reference_node).
        The merged FASTQ files fall into one of three cases:

        1.  All single-end: orientation is "single".
        2.  All paired-end: orientation is "paired_<orient>", where
            <orient> comes from get_paired_orientation.
        3.  Mixed: AssertionError is raised.

        For the paired case, the read counts returned by
        get_paired_orientation are stored in self.cache under the key
        (group identifier, fastq identifier, reference identifier).
        """
        from genomicode import alignlib
        from Betsy import module_utils

        group_node, fastq_node, reference_node = antecedents
        fastq_files = module_utils.find_merged_fastq_files(
            group_node.identifier, fastq_node.identifier)
        assert fastq_files, "No fastq files."
        ref = alignlib.create_reference_genome(reference_node.identifier)

        attrs = out_attributes.copy()

        # The last element of each entry is the second mate, or None
        # for single-end data.
        second_mates = [entry[-1] for entry in fastq_files]
        if all(mate is None for mate in second_mates):
            attrs["orientation"] = "single"
            return attrs
        if None in second_mates:
            raise AssertionError("Mixed single and paired-end.")

        # Everything is paired.  To save time, only the first group of
        # FASTQ files is examined; the rest are assumed to have the
        # same orientation.
        _, pair1_filename, pair2_filename = fastq_files[0]
        result = get_paired_orientation(
            ref.fasta_file_full, pair1_filename, pair2_filename)
        orient, reads_ns, reads_fr, reads_rf, reads_ff = result
        attrs["orientation"] = "paired_%s" % orient

        cache_key = (
            group_node.identifier,
            fastq_node.identifier,
            reference_node.identifier,
        )
        self.cache[cache_key] = reads_ns, reads_fr, reads_rf, reads_ff
        return attrs
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Determine the read orientation and write it to outfile.

        antecedents unpacks to (fastq_node, group_node, reference_node).
        The orientation is "single" when no sample has a second mate,
        and "paired_<orient>" (from get_paired_orientation) when every
        sample has one.  A mix of the two raises AssertionError.

        The result is stored as an mlib.Orientation and written with
        mlib.write_orientation.  The read counts are None for
        single-end data.  Returns an empty metadata dict.
        """
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, group_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            group_node.identifier, fastq_node.identifier)
        assert fastq_files, "No fastq files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        metadata = {}

        orientation = None
        reads_ns = reads_fr = reads_rf = reads_ff = None

        # The last element of each entry is the second mate, or None
        # for single-end data.  Count how many samples lack one.
        second_mates = [entry[-1] for entry in fastq_files]
        num_single = second_mates.count(None)

        if num_single == len(second_mates):
            orientation = "single"
        elif num_single:
            # Mixed single and paired is not handled.
            raise AssertionError("Mixed single and paired-end.")
        else:
            # All paired.  To save time, only the first group of FASTQ
            # files is examined; the rest are assumed to match.
            _, pair1_filename, pair2_filename = fastq_files[0]
            result = get_paired_orientation(
                ref.fasta_file_full, pair1_filename, pair2_filename)
            orient, reads_ns, reads_fr, reads_rf, reads_ff = result
            orientation = "paired_%s" % orient

        assert orientation

        record = mlib.Orientation(
            orientation, reads_ns, reads_fr, reads_rf, reads_ff)
        mlib.write_orientation(record, outfile)
        return metadata
Esempio n. 4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run RSEM on every merged FASTQ sample.

        antecedents unpacks to (fastq_node, sample_node, strand_node,
        reference_node).  out_attributes["align_to"] must be "genome"
        or "transcriptome" and decides whether a genome BAM file is
        requested from RSEM.

        According to the original author's notes, RSEM does not work
        right if there is a space in the sample name.  Each sample name
        and FASTQ file name is therefore replaced with a hashed version
        (the FASTQ files are symlinked into out_path under the hashed
        names), and the files RSEM creates are renamed back to the
        original sample name afterwards.

        Single-end samples (empty pair2) keep pair2_h as None: no
        symlink is created or removed for the missing mate, it takes no
        part in the duplicate check, and None is passed as fastq_file2.

        Returns a metadata dict with the keys "tool", "commands" and
        "num cores".  Raises AssertionError when no FASTQ files are
        found, when align_to or the strandedness is not recognized, or
        when hashed names collide.  The per-sample result and log files
        are verified with filelib.assert_exists_nz_many.
        """
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            # Reset for every sample.  Without this, a single-end
            # sample would either hit an undefined name or silently
            # reuse the hashed mate of the previous sample.
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.  A missing second mate
        # (None) is not a name and must not be counted as a duplicate.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            if j.pair2_h is not None:
                assert j.pair2_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            if j.pair2_h is not None:
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2_h is not None:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        # If using the STAR aligner, then most memory efficient way is
        # to let STAR take care of the multiprocessing: every command
        # gets all the cores and the commands run one at a time.
        if use_STAR:
            threads_per_job = num_cores
            max_procs = 1
            keywds = {"align_with_star": True}
        else:
            threads_per_job = max(1, num_cores / len(jobs))
            max_procs = num_cores
            keywds = {"align_with_bowtie2": True}

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=threads_per_job,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        parallel.pshell(commands, max_procs=max_procs, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            if j.pair2_h is not None:
                filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align each merged FASTQ sample with STAR (single pass).

        antecedents unpacks to (fastq_node, sample_node, ref_node).
        For every sample, STAR is invoked with the output prefix
        <out_path>/<sample>. and is expected to produce
        <out_path>/<sample>.Aligned.out.bam.  The samples are aligned
        one after another, each command receiving num_cores.  The data
        is always treated as unstranded here.

        Returns a metadata dict with the keys "tool", "commands" and
        "num_cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, ref_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # Check the reference up front.  A wrong reference would
        # otherwise produce an error that is hard to diagnose.
        alignlib.assert_is_STAR_reference(ref.path)

        metadata = {"tool": "STAR %s" % alignlib.get_STAR_version()}

        # Strandedness is not detected in this module.
        is_stranded = False

        # Example invocation (from the original author):
        # STAR --runThreadN 40 --genomeDir test05 \
        #   --readFilesIn test.fastq/test03_R1_001.fastq \
        #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
        # If unstranded, add --outSAMstrandField intronMotif

        # One job per sample:
        # (pair1, pair2, star_prefix, bam_filename, log_filename)
        jobs = []
        for sample, pair1, pair2 in fastq_files:
            out_prefix = "%s." % sample
            star_prefix = os.path.join(out_path, out_prefix)
            bam_filename = os.path.join(
                out_path, "%sAligned.out.bam" % out_prefix)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            jobs.append(
                (pair1, pair2, star_prefix, bam_filename, log_filename))

        commands = []
        for pair1, pair2, star_prefix, bam_filename, log_filename in jobs:
            cmd = alignlib.make_STAR_command(
                ref.path, star_prefix, num_cores, is_stranded,
                pair1, pair2, log_filename)
            # For debugging: an existing, non-empty BAM file is reused
            # rather than regenerated.
            already_done = filelib.exists_nz(bam_filename)
            if not already_done:
                parallel.sshell(cmd, path=out_path)
            filelib.assert_exists_nz(bam_filename)
            commands.append(cmd)

        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        return metadata
Esempio n. 6
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align each merged FASTQ sample with STAR, optionally in two passes.

        antecedents unpacks to (fastq_node, sample_node, strand_node,
        ref_node).  The "two_pass" user option ("no" or "yes") selects
        the mode:

        - Pass 1 always runs and produces p1.<sample>.Aligned.out.bam.
        - With two_pass, a new STAR index "genome.2pass" is generated
          in out_path from the pass 1 p1.<sample>.SJ.out.tab files, and
          each sample is aligned again against that index to produce
          <sample>.Aligned.out.bam.
        - Without two_pass, <sample>.Aligned.out.bam is a symlink to
          the pass 1 BAM file.

        Steps whose output already exists are skipped; the original
        author describes this as a debugging aid.

        Returns a metadata dict with the keys "tool", "commands" and
        "num_cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, ref_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        # Check the reference up front.  A wrong reference would
        # otherwise produce an error that is hard to diagnose.
        alignlib.assert_is_STAR_reference(ref.path)

        metadata = {"tool": "STAR %s" % alignlib.get_STAR_version()}

        two_pass_option = mlib.get_user_option(
            user_options, "two_pass", allowed_values=["no", "yes"])
        two_pass = (two_pass_option == "yes")

        # Anything other than "unstranded" counts as stranded.
        is_stranded = stranded.stranded != "unstranded"

        # Example invocation (from the original author):
        # STAR --runThreadN 40 --genomeDir test05 \
        #   --readFilesIn test.fastq/test03_R1_001.fastq \
        #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
        # If unstranded, add --outSAMstrandField intronMotif

        # Describe the files of every sample.  Pass 1 files carry a
        # "p1." prefix; pass 2 files are named after the sample only.
        jobs = []  # list of filelib.GenericObject objects
        for sample, pair1, pair2 in fastq_files:
            p1_prefix = "p1.%s." % sample
            p2_prefix = "%s." % sample
            job = filelib.GenericObject(
                sample=sample,
                pair1=pair1,
                pair2=pair2,
                pass1_out_prefix=p1_prefix,
                pass2_out_prefix=p2_prefix,
                pass1_bam_filename=os.path.join(
                    out_path, "%sAligned.out.bam" % p1_prefix),
                pass2_bam_filename=os.path.join(
                    out_path, "%sAligned.out.bam" % p2_prefix),
                sjdb_filename=os.path.join(
                    out_path, "p1.%s.SJ.out.tab" % sample),
                log1_filename=os.path.join(out_path, "p1.%s.log" % sample),
                log2_filename=os.path.join(out_path, "%s.log" % sample),
            )
            jobs.append(job)

        # Pass 1.  The command is recorded even when the alignment
        # itself is skipped.
        commands = []
        for job in jobs:
            prefix = os.path.join(out_path, job.pass1_out_prefix)
            cmd = alignlib.make_STAR_command(
                ref.path, prefix, num_cores, is_stranded,
                job.pair1, job.pair2, job.log1_filename)
            # For debugging: reuse an existing, non-empty BAM file.
            if not filelib.exists_nz(job.pass1_bam_filename):
                parallel.sshell(cmd, path=out_path)
            filelib.assert_exists_nz(job.pass1_bam_filename)
            commands.append(cmd)

        if two_pass:
            # Build a new index that includes the splice junctions
            # found in pass 1.
            sj_index = os.path.join(out_path, "genome.2pass")
            sjdb_files = [job.sjdb_filename for job in jobs]
            filelib.assert_exists_nz_many(sjdb_files)
            index_cmd = alignlib.make_STAR_index_command(
                ref.fasta_file_full, sj_index,
                sjdb_files=sjdb_files, num_cores=num_cores)
            index_cmd = "%s >& genome.2pass.log" % index_cmd
            commands.append(index_cmd)

            # For debugging: skip if the log already exists.
            # NOTE(review): the command runs with path=out_path, but
            # this check uses a bare relative file name, so it may be
            # looking in a different directory than the one the log is
            # written to -- confirm.
            if not filelib.exists_nz("genome.2pass.log"):
                parallel.sshell(index_cmd, path=out_path)
            alignlib.assert_is_STAR_reference(sj_index)

        # Pass 2.
        for job in jobs:
            # For debugging: leave an existing pass 2 BAM file alone.
            if os.path.exists(job.pass2_bam_filename):
                continue
            if not two_pass:
                # No second alignment; the pass 2 name just points at
                # the pass 1 BAM file.
                os.symlink(job.pass1_bam_filename, job.pass2_bam_filename)
                continue
            prefix = os.path.join(out_path, job.pass2_out_prefix)
            cmd = alignlib.make_STAR_command(
                sj_index, prefix, num_cores, is_stranded,
                job.pair1, job.pair2, job.log2_filename)
            parallel.sshell(cmd, path=out_path)
            commands.append(cmd)
            filelib.assert_exists_nz(job.pass2_bam_filename)

        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # Original author's notes: STAR takes 28 Gb per process, so be
        # careful not to use more memory than the machine has.
        # Defaults:
        # --limitGenomeGenerateRAM   31000000000
        # --outFilterMismatchNmax    10             Num mismatches.
        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_filename):
        """Summarize every <sample>.matches.txt file into one table.

        antecedents unpacks to (fastq_node, sample_node, align_node).
        Each .matches.txt file found under align_node.identifier is
        matched to its FASTQ files by sample name and handed to
        summarize_matches_file, which is run through parallel.pyfun.
        The per-sample results are written to out_filename as a
        tab-delimited table with one row per sample.

        The "num_mismatches" user option must be an int in [0, 25).

        Returns a metadata dict with the keys "num_mismatches" and
        "num_cores" (the number of processes actually used, not the
        num_cores argument).  Raises AssertionError if no FASTQ or
        .matches.txt files are found, if their counts differ, or if a
        .matches.txt file has no FASTQ entry for its sample.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        fastq_node, sample_node, align_node = antecedents
        fastq_data = mlib.find_merged_fastq_files(sample_node.identifier,
                                                  fastq_node.identifier)
        assert fastq_data, "I could not find any FASTQ files."
        align_filenames = filelib.list_files_in_path(align_node.identifier,
                                                     endswith=".matches.txt")
        assert align_filenames, "No .matches.txt files."
        align_filenames.sort()
        metadata = {}

        assert len(fastq_data) == len(align_filenames), \
               "Mismatch: num samples %d %d" % (
            len(fastq_data), len(align_filenames))

        num_mismatches = mlib.get_user_option(user_options,
                                              "num_mismatches",
                                              type=int)
        assert num_mismatches >= 0 and num_mismatches < 25
        metadata["num_mismatches"] = num_mismatches

        sample2fastqdata = {}
        for x in fastq_data:
            sample, f1, f2 = x
            sample2fastqdata[sample] = x

        # list of (sample, align_filename, summary_filename,
        #   fastq_filename1, fastq_filename2)
        jobs = []
        ext = ".matches.txt"
        for in_filename in align_filenames:
            # The file is named <sample>.matches.txt
            f = os.path.split(in_filename)[1]
            assert f.endswith(ext)
            sample = f[:-len(ext)]
            assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
            summary_filename = "%s.summary.txt" % sample
            x, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
            x = sample, in_filename, summary_filename, \
                fastq_filename1, fastq_filename2
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, align_filename, summary_filename, \
                    fastq_file1, fastq_file2 = x
            args = align_filename, fastq_file1, fastq_file2, num_mismatches
            keywds = {
                "temp_path": ".",
                "outfile": summary_filename,
            }
            x = summarize_matches_file, args, keywds
            jobs2.append(x)

        # This can take a lot of memory (depending on the number of
        # reads, can easily take 8 Gb) and is I/O intensive, so run at
        # most MAX_PROCS processes at a time.  The actual number is
        # chosen by calc_max_procs_from_ram.
        # NOTE(review): the 30 is presumably the memory budget per
        # process -- confirm against calc_max_procs_from_ram.
        MAX_PROCS = 4
        nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.1)
        metadata["num_cores"] = nc
        assert len(results) == len(jobs2)

        # Put together the results in a table.
        # NOTE(review): the header contains "match" twice.  The first
        # is a count, the second is the perc_perfect value.  Left as is
        # in case something downstream reads these column names.
        header = "sample", "match", "total", "RPM", "match", "mismatch"
        # The with-block closes the file even if a row fails.
        with open(out_filename, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for job, d in zip(jobs, results):
                sample = job[0]
                match = d["perfect_alignments"]
                total = d["total_alignments"]
                # A sample without any alignments gets an RPM of 0
                # instead of aborting the table with a
                # ZeroDivisionError.
                if total:
                    rpm = int(float(match) / total * 1E6)
                else:
                    rpm = 0
                perc_match = d["perc_perfect"]
                perc_mismatch = 1 - d["perc_perfect"]
                row = sample, match, total, rpm, perc_match, perc_mismatch
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")
        return metadata
Esempio n. 8
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Write a per-sample alignment summary spreadsheet to outfile.

        antecedents unpacks to (fastq_node, sample_node, bam_node).
        For every BAM file, the reads in the sample's first FASTQ file
        are counted with count_reads and the alignments in the BAM file
        with count_alignments.  The counts are written as a
        tab-delimited table to "summary.txt" in the current directory,
        which is then converted to outfile with txt2xls.

        Returns a metadata dict with the key "num_cores" (the number of
        processes used, capped at MAX_CORES).  Raises AssertionError if
        a BAM file has no FASTQ entry for its sample.
        """
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        MAX_CORES = 4  # I/O intensive.

        fastq_node, sample_node, bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        sample2fastq = mlib.find_merged_fastq_files(sample_node.identifier,
                                                    fastq_node.identifier,
                                                    as_dict=True)

        metadata = {}

        jobs = []  # list of (sample, bam_file, fastq_file)
        for filename in bam_filenames:
            path, sample, ext = mlib.splitpath(filename)
            assert sample in sample2fastq, "Missing fastq: %s" % sample
            # Only the first mate is needed to count the reads.
            fastq1, fastq2 = sample2fastq[sample]
            x = sample, filename, fastq1
            jobs.append(x)

        # Two function calls per job, in this order:
        # count_reads, then count_alignments.
        funcalls = []
        for x in jobs:
            sample, bam_filename, fastq_filename = x
            funcalls.append((count_reads, (fastq_filename, ), {}))
            funcalls.append((count_alignments, (bam_filename, ), {}))
        assert len(funcalls) == len(jobs) * 2

        nc = min(num_cores, MAX_CORES)
        results = parallel.pyfun(funcalls, num_procs=nc)
        metadata["num_cores"] = nc

        # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
        results2 = []
        for i, x in enumerate(jobs):
            sample, bam_filename, fastq_filename = x
            total_reads = results[i * 2]
            aligned_reads, alignments = results[i * 2 + 1]
            # A FASTQ file without reads reports 0% aligned instead of
            # aborting the whole summary with a ZeroDivisionError.
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads
            else:
                perc_aligned = 0.0
            x = sample, alignments, aligned_reads, total_reads, perc_aligned
            results2.append(x)
        results = results2

        # sort by sample name
        results.sort()

        # Make table where the rows are the samples and the columns
        # are the statistics.
        table = []
        header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
                  "Perc Aligned")
        table.append(header)
        for x in results:
            sample, alignments, aligned_reads, total_reads, perc_aligned = x

            x1 = parselib.pretty_int(alignments)
            x2 = parselib.pretty_int(aligned_reads)
            x3 = parselib.pretty_int(total_reads)
            x4 = "%.2f%%" % (perc_aligned * 100)
            x = sample, x1, x2, x3, x4
            assert len(x) == len(header)
            table.append(x)

        # Write out the table as text file.  The with-block closes the
        # file even if writing a row fails.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for x in table:
                handle.write("\t".join(x) + "\n")

        # Quote the output file name so that a path containing spaces
        # or shell metacharacters does not break the redirection.
        txt2xls = mlib.findbin("txt2xls", quote=True)
        sq = parallel.quote
        parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, sq(outfile)))
        return metadata
Esempio n. 9
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Align each merged FASTQ sample with bwa mem.

        antecedents unpacks to (fastq_node, group_node, reference_node).
        fastq_node.identifier must be an existing directory.  For every
        sample, one bwa mem command is built that writes
        <out_path>/<sample>.bam and <out_path>/<sample>.log.  The
        commands are run through parallel.pshell.

        Returns a metadata dict with the keys "tool", "num cores" (the
        number of threads given to each command) and "commands".
        Raises AssertionError if the FASTQ path is missing, no FASTQ
        files are found, or a sample name occurs more than once.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils

        fastq_node, group_node, reference_node = antecedents
        fastq_path = fastq_node.identifier
        assert os.path.exists(fastq_path)
        assert os.path.isdir(fastq_path)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

        # Find the merged fastq files.
        grouped_fastq_files = module_utils.find_merged_fastq_files(
            group_node.identifier, fastq_path)
        # Fail with a clear message here.  Otherwise, the cores-per-job
        # calculation below would divide by zero.
        assert grouped_fastq_files, "I could not find any FASTQ files."

        # Make sure no duplicate samples.
        samples = [x[0] for x in grouped_fastq_files]
        assert len(samples) == len(set(samples)), "dup sample"

        # Make a list of all the jobs to do.
        # list of (sample, pair1, pair2, bam_filename, log_filename)
        jobs = []
        for x in grouped_fastq_files:
            sample, pair1, pair2 = x
            bam_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = sample, pair1, pair2, bam_filename, log_filename
            jobs.append(x)

        # Uses ~6 Gb per process.
        # Calculate the number of cores per job.
        nc = max(1, num_cores/len(jobs))
        metadata["num cores"] = nc

        # Make the bwa commands.
        commands = []
        for x in jobs:
            sample, pair1, pair2, bam_filename, log_filename = x
            x = alignlib.make_bwa_mem_command(
                ref.fasta_file_full, log_filename, pair1,
                fastq_file2=pair2, bam_filename=bam_filename, num_threads=nc)
            commands.append(x)

        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully: every BAM and
        # log file must exist and be non-empty.
        bam_filenames = [x[-2] for x in jobs]
        log_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(bam_filenames + log_filenames)
        return metadata
Esempio n. 10
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Align each merged FASTQ sample with TopHat.

        antecedents unpacks to (fastq_node, sample_node, strand_node,
        reference_node, gene_node).  gene_node.identifier is used as
        the GTF file, and the optional "tophat_transcriptome_fa" user
        option supplies a transcriptome FASTA file.  The strandedness
        read from strand_node is translated into the matching "fr-..."
        TopHat library type.

        Each sample is aligned into <out_path>/<sample>.tophat with the
        command's output redirected (">&") to <out_path>/<sample>.log.
        Afterwards, accepted_hits.bam must exist in every sample
        directory.

        Returns a metadata dict with the keys "tool", "commands" and
        "num_cores".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        (fastq_node, sample_node, strand_node, reference_node,
         gene_node) = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {"tool": "TopHat %s" % alignlib.get_tophat_version()}

        transcriptome_fa = mlib.get_user_option(
            user_options, "tophat_transcriptome_fa", check_file=True)
        assert gtf_file or transcriptome_fa, (
            "Either tophat_gtf_file or tophat_transcriptome_fa (preferred) "
            "must be provided.")

        # One job per sample:
        # (sample, pair1, pair2, tophat_path, log_filename)
        jobs = [
            (sample, pair1, pair2,
             os.path.join(out_path, "%s.tophat" % sample),
             os.path.join(out_path, "%s.log" % sample))
            for (sample, pair1, pair2) in fastq_files]

        # Strandedness -> TopHat library type.
        s2ltype = {
            "unstranded": "fr-unstranded",
            "firststrand": "fr-firststrand",
            "secondstrand": "fr-secondstrand",
        }
        assert stranded.stranded in s2ltype, "Unknown stranded: %s" % \
               stranded.stranded
        library_type = s2ltype[stranded.stranded]

        # Takes ~3 Gb per process.
        quote = parallel.quote
        commands = []
        for sample, pair1, pair2, tophat_path, log_filename in jobs:
            # Divide the cores among the jobs, but never go below 1.
            threads = max(1, num_cores/len(jobs))
            cmd = alignlib.make_tophat_command(
                ref.fasta_file_full, tophat_path, pair1, fastq_file2=pair2,
                gtf_file=gtf_file, transcriptome_fa=transcriptome_fa,
                library_type=library_type, num_threads=threads)
            commands.append("%s >& %s" % (cmd, quote(log_filename)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # The run succeeded only if every sample directory contains a
        # non-empty accepted_hits.bam.
        bam_filenames = [
            os.path.join(job[3], "accepted_hits.bam") for job in jobs]
        filelib.assert_exists_nz_many(bam_filenames)

        return metadata
Esempio n. 11
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align the merged FASTQ files of every sample with bowtie1.

        antecedents is unpacked as (fastq_node, sample_node, orient_node,
        reference_node).  One bowtie1 command is generated per sample;
        it is given <out_path>/<sample>.sam as its SAM file and its
        output is redirected to <out_path>/<sample>.log.  The commands
        are run through parallel.pshell with max_procs=num_cores.

        Returns the metadata dictionary with the keys "tool",
        "commands" and "num_cores".

        Raises AssertionError if the reference FASTA file does not
        exist, the orientation is not recognized, a SAM file fails the
        filelib.exists_nz check, or a log file contains "No alignments".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, orient_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie1 %s" % alignlib.get_bowtie1_version()

        # With low alignment percentages, might want to play around with:
        # - insert size
        # - maximum mismatch

        # Make a list of the jobs to run.
        jobs = []
        for sample, pair1, pair2 in fastq_files:
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            jobs.append((sample, pair1, pair2, sam_filename, log_filename))

        # Translate the orientation read from the orientation file into
        # the value passed to make_bowtie1_command.
        attr2orient = {
            "single": None,
            "paired_fr": "fr",
            "paired_rf": "rf",
            "paired_ff": "ff",
        }
        assert orient.orientation in attr2orient, \
               "Unknown orientation: %s" % orient.orientation
        orientation = attr2orient[orient.orientation]

        # Split the cores evenly over the jobs.  The value is the same
        # for every job, so compute it once.  "//" keeps it an integer
        # even where "/" is true division.
        nc = 1
        if jobs:
            nc = max(1, num_cores // len(jobs))

        # Generate bowtie1 commands for each of the files.
        sq = parallel.quote
        commands = []
        for sample, pair1, pair2, sam_filename, log_filename in jobs:
            cmd = alignlib.make_bowtie1_command(ref.fasta_file_full,
                                                sam_filename,
                                                pair1,
                                                fastq_file2=pair2,
                                                orientation=orientation,
                                                num_threads=nc)
            commands.append("%s >& %s" % (cmd, sq(log_filename)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for sample, pair1, pair2, sam_filename, log_filename in jobs:
            # Make sure sam file created.
            assert filelib.exists_nz(sam_filename), \
                   "Missing: %s" % sam_filename
            # Make sure there are some alignments.  Close the log file
            # right away instead of leaving the handle open.
            with open(log_filename) as handle:
                log_text = handle.read()
            assert "No alignments" not in log_text, "No alignments"

        return metadata
Esempio n. 12
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Align every sample with bowtie2 and convert the output to BAM.

        antecedents is unpacked as (fastq_node, sample_node, orient_node,
        reference_node).  For each sample the FASTQ file(s) are
        symlinked under space-free names in the current working
        directory, bowtie2 is run on the links, its stderr is written to
        <out_path>/<sample>.log and its stdout is piped into
        "samtools view -bS" to produce <out_path>/<sample>.bam.  The
        commands are run through parallel.pshell with
        max_procs=num_cores.

        Returns the metadata dictionary with the keys "tool",
        "commands" and "num_cores".

        Raises AssertionError if the reference FASTA file does not
        exist or the orientation is not recognized.  The BAM and log
        files are checked with filelib.assert_exists_nz_many.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, orient_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Bowtie2 doesn't handle files with spaces in them.  Make
        # temporary files without spaces.

        # Make a list of the jobs to run.
        jobs = []
        for i, x in enumerate(fastq_files):
            sample, pair1, pair2 = x
            bam_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            # The temporary names are built from the job index and a
            # hash of the sample name rather than the sample name itself.
            sample_h = hashlib.hash_var(sample)
            temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
            temp_pair2 = None
            if pair2:
                temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
            j = filelib.GenericObject(sample=sample,
                                      pair1=pair1,
                                      pair2=pair2,
                                      temp_pair1=temp_pair1,
                                      temp_pair2=temp_pair2,
                                      bam_filename=bam_filename,
                                      log_filename=log_filename)
            jobs.append(j)

        for j in jobs:
            os.symlink(j.pair1, j.temp_pair1)
            # Decide per job whether there is a second file.  A variable
            # left over from the loop above would only describe the last
            # sample (and would not exist at all without any samples).
            if j.pair2:
                os.symlink(j.pair2, j.temp_pair2)

        # Translate the orientation read from the orientation file into
        # the value passed to make_bowtie2_command.
        attr2orient = {
            "single": None,
            "paired_fr": "fr",
            "paired_rf": "rf",
            "paired_ff": "ff",
        }
        assert orient.orientation in attr2orient, \
               "Unknown orientation: %s" % orient.orientation
        orientation = attr2orient[orient.orientation]

        # Split the cores evenly over the jobs.  "//" keeps the result
        # an integer even where "/" is true division.
        nc = 1
        if jobs:
            nc = max(1, num_cores // len(jobs))

        # Generate bowtie2 commands for each of the files.
        # Takes ~4 Gb per job.
        samtools = mlib.findbin("samtools")
        sq = parallel.quote
        commands = []
        for j in jobs:
            # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
            #  2> test.log | samtools view -bS -o test.bam -
            x1 = alignlib.make_bowtie2_command(ref.fasta_file_full,
                                               j.temp_pair1,
                                               fastq_file2=j.temp_pair2,
                                               orientation=orientation,
                                               num_threads=nc)
            x2 = [
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(j.bam_filename),
                "-",
            ]
            x2 = " ".join(x2)
            x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        bam_filenames = [j.bam_filename for j in jobs]
        log_filenames = [j.log_filename for j in jobs]
        filelib.assert_exists_nz_many(bam_filenames + log_filenames)

        return metadata
Esempio n. 13
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Turn bwa .sai files into SAM files with "bwa samse"/"sampe".

        antecedents is unpacked as (fastq_node, sai_node, orient_node,
        sample_node, reference_node).  For each sample found in the
        merged FASTQ files, the matching .sai file(s) are located in the
        sai directory, one bwa command is built that writes
        <out_path>/<sample>.sam, and its output is redirected to
        <out_path>/<sample>.log.  The commands are run through
        parallel.pshell with max_procs=num_cores.

        Returns the metadata dictionary with the keys "tool",
        "commands" and "num_cores".

        Raises AssertionError if the sai directory or any expected .sai
        file is missing, if .sai and FASTQ files do not pair up, or if
        the orientation is not "single", "paired_fr" or "paired_rf".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        (fastq_node, sai_node, orient_node, sample_node,
         reference_node) = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        sai_path = sai_node.identifier
        assert filelib.dir_exists(sai_path)
        orient = mlib.read_orientation(orient_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {"tool": "bwa %s" % alignlib.get_bwa_version()}

        # Technically, the SampleGroupFile is not needed, since it is
        # already reflected in the sai data.  It is used anyway because
        # the sai data might not always be generated by BETSY.

        # All the .sai files available for matching against the samples.
        sai_filenames = filelib.list_files_in_path(
            sai_path, endswith=".sai", case_insensitive=True)
        assert sai_filenames, "No .sai files."

        bwa = mlib.findbin("bwa")
        # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
        # bwa sampe -f <output.sam> <reference.fa> <input_1.sai> <input_2.sai>
        #   <input_1.fq> <input_2.fq> >

        # Each job is a tuple of full paths:
        # (sample, pair1.fq, pair1.sai, pair2.fq, pair2.sai,
        #  output.sam, output.log)
        jobs = []
        for sample, fq1, fq2 in fastq_files:
            # Accepted names for the sai files:
            # <sai_path>/<sample>.sai    Single end read
            # <sai_path>/<sample>_1.sai  Paired end read
            # <sai_path>/<sample>_2.sai  Paired end read
            stem_pair1 = "%s_1" % sample
            stem_pair2 = "%s_2" % sample
            sai1 = sai2 = None
            for candidate in sai_filenames:
                _, stem, ext = mlib.splitpath(candidate)
                assert ext == ".sai"
                if stem == sample or stem == stem_pair1:
                    # Only one file may fill the first slot.
                    assert not sai1
                    sai1 = candidate
                elif stem == stem_pair2:
                    assert not sai2
                    sai2 = candidate
            assert sai1, "Missing .sai file: %s" % sample
            if fq2:
                assert sai2, "Missing .sai file 2: %s" % sample
            if sai2:
                assert fq2, "Missing .fq file 2: %s" % sample

            jobs.append((
                sample, fq1, sai1, fq2, sai2,
                os.path.join(out_path, "%s.sam" % sample),
                os.path.join(out_path, "%s.log" % sample),
            ))

        orientation = orient.orientation
        assert orientation in ["single", "paired_fr", "paired_rf"]

        # Make a list of bwa commands.
        sq = mlib.sq
        commands = []
        for job in jobs:
            sample, fq1, sai1, fq2, sai2, sam_filename, log_filename = job

            subcommand = "samse"
            if orientation.startswith("paired"):
                subcommand = "sampe"

            if orientation == "single":
                assert not fq2
                assert not sai2
                inputs = [sai1, fq1]
            elif orientation == "paired_rf":
                # Reverse-forward: hand the second mate to bwa first.
                inputs = [sai2, sai1, fq2, fq1]
            else:
                inputs = [sai1, sai2, fq1, fq2]

            parts = [
                sq(bwa),
                subcommand,
                "-f", sq(sam_filename),
                sq(ref.fasta_file_full),
            ]
            parts.extend([sq(name) for name in inputs])
            parts.extend([">&", sq(log_filename)])
            commands.append(" ".join(parts))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully: every SAM file
        # (second to last element of each job) has to pass the check.
        sam_filenames = [job[-2] for job in jobs]
        filelib.assert_exists_nz_many(sam_filenames)

        return metadata
Esempio n. 14
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run pyrseqc (RSeQC) on every BAM file, one sample at a time.

        antecedents is unpacked as (fasta_node, bam_node, sample_node,
        orient_node).  The read length is guessed as the most common
        sequence length among the first 100 records of one sample's
        first FASTA file.  One pyrseqc command is built per BAM file,
        writing into <out_path>/<sample>, and the commands are run
        sequentially with "-j <num_cores>" each.

        Returns the metadata dictionary with the keys "tool",
        "commands" and "num_cores".

        Raises AssertionError if the gene model is not handled, the
        gene model directory does not exist, no FASTA data or sequences
        are found, a BAM file has no matching sample, or the output of
        a command contains "Traceback".
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import genomelib
        from genomicode import config
        from Betsy import module_utils as mlib

        fasta_node, bam_node, sample_node, orient_node = antecedents
        fasta_data = mlib.find_merged_fastq_files(sample_node.identifier,
                                                  fasta_node.identifier,
                                                  find_fasta=True)
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        # TODO: Try to figure out version.
        metadata = {}
        metadata["tool"] = "RSeQC (unknown version)"

        pyrseqc = mlib.findbin("pyrseqc")

        gene_model = mlib.get_user_option(user_options,
                                          "gene_model",
                                          not_empty=True,
                                          allowed_values=["hg19"])
        if gene_model == "hg19":
            gene_path = config.rseqc_hg19
        else:
            raise AssertionError("Unhandled: %s" % gene_model)

        # dir_exists only reports whether the directory is there; the
        # result has to be asserted for the check to have any effect.
        assert filelib.dir_exists(gene_path), \
               "Gene model directory not found: %s" % gene_path
        gene_model_bed = os.path.join(gene_path, "RefSeq.bed12")
        housekeeping_model_bed = os.path.join(gene_path,
                                              "HouseKeepingGenes.bed")

        sample2fastadata = {}  # sample -> (sample, fasta1, fasta2)
        for x in fasta_data:
            sample, f1, f2 = x
            sample2fastadata[sample] = x

        is_paired = orient.orientation.startswith("paired")

        # Guess the read length.  Read the first fasta of whichever
        # sample the dictionary yields first.
        assert sample2fastadata
        first_sample = next(iter(sample2fastadata))
        filename = sample2fastadata[first_sample][1]
        lengths = {}  # length -> count
        for i, x in enumerate(genomelib.read_fasta_many(filename)):
            if i >= 100:
                break
            title, sequence = x
            n = len(sequence)
            lengths[n] = lengths.get(n, 0) + 1
        # Use the most common length.  On a tie, max() keeps the first
        # length it encounters.
        assert lengths, "No sequences found in: %s" % filename
        read_length = max(lengths, key=lengths.get)
        assert read_length

        jobs = []  # sample, bam_filename, fasta_file1, fasta_file2, outdir
        for bam_filename in bam_filenames:
            # <path>/<sample>.bam
            p, sample, e = mlib.splitpath(bam_filename)
            assert sample in sample2fastadata, \
                   "No FASTA data for sample: %s" % sample
            x, f1, f2 = sample2fastadata[sample]
            outdir = os.path.join(out_path, sample)
            jobs.append((sample, bam_filename, f1, f2, outdir))

        # Some of the modules of RSeQC use a lot of memory.  Have seen
        # a Python process take 33 Gb (read_distribution.py), and an R
        # process take 200 Gb (spun off by read_quality.py).  However,
        # most of the modules use much less memory.  So run one pyrseqc
        # at a time, and let each one use all the cores.  Is probably
        # slower than running multiple pyrseqc, but takes less memory.
        sq = mlib.sq
        commands = []
        for sample, bam_filename, fasta_filename1, fasta_filename2, outdir \
                in jobs:
            # pyrseqc.py -j 20 --paired_end rqc11.bam rqc14.fa 76 \
            #   mod07.txt hg19.HouseKeepingGenes.bed rqc21 --dry_run
            x = [
                sq(pyrseqc),
                "-j",
                str(num_cores),
            ]
            if is_paired:
                x += ["--paired_end"]
            # Only the first FASTA file is passed to pyrseqc.
            x += [
                sq(bam_filename),
                sq(fasta_filename1),
                str(read_length),
                sq(gene_model_bed),
                sq(housekeeping_model_bed),
                sq(outdir),
            ]
            commands.append(" ".join(x))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # Because of memory, just run one at a time, but each one, use
        # multiple cores.
        for cmd in commands:
            output = parallel.sshell(cmd)
            assert "Traceback" not in output, output

        filelib.assert_exists_nz(out_path)

        return metadata
Esempio n. 15
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Run subtract_mouse_reads on every merged FASTQ file.

        antecedents is unpacked as (fastq_node, sample_node,
        summary_node).  Each sample's FASTQ file(s) are paired with the
        <sample>.matches.txt file found in the summary directory, and
        subtract_mouse_reads (defined elsewhere in this file) is called
        once per FASTQ file through parallel.pyfun.  For an input file
        <name>, the output paths handed to it are <out_path>/<name> and
        <out_path>/<name>.subtracted.

        Returns the metadata dictionary with the keys "num_mismatches"
        and "num_cores".

        Raises AssertionError if no FASTQ or .matches.txt files are
        found, the "num_mismatches" option is outside 0..24, a sample
        has no summary file, or the two files of a pair are in
        different directories.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # Because this is I/O heavy, don't use so many cores.  Also,
        # takes 4-5 Gb RAM per process.
        MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)

        fastq_node, sample_node, summary_node = antecedents
        fastq_path = fastq_node.identifier
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_path)
        assert fastq_files, "I could not find any FASTQ files."
        summary_filenames = filelib.list_files_in_path(
            summary_node.identifier, endswith=".matches.txt")
        assert summary_filenames, "No .matches.txt files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_mismatches = mlib.get_user_option(
            user_options, "num_mismatches", type=int)
        assert 0 <= num_mismatches < 25, \
               "num_mismatches should be between 0 and 24: %s" % \
               num_mismatches
        metadata["num_mismatches"] = num_mismatches

        summary_suffix = ".matches.txt"
        sample2summary = {}  # sample -> summary_filename
        for filename in summary_filenames:
            # <sample>.matches.txt
            p, f = os.path.split(filename)
            assert f.endswith(summary_suffix)
            # Cut off only the trailing suffix, so that a sample name
            # that happens to contain ".matches.txt" is left intact.
            sample = f[:-len(summary_suffix)]
            assert sample not in sample2summary
            sample2summary[sample] = filename

        # list of (sample, fastq_file1, fastq_file2, summary_filename,
        #          out_file1, out_file2, subtracted_file1, subtracted_file2)
        jobs = []
        for sample, pair1_fastq, pair2_fastq in fastq_files:
            assert sample in sample2summary, \
                   "Missing summary for sample: %s" % sample
            p1, f1 = os.path.split(pair1_fastq)
            out1_fastq = os.path.join(out_path, f1)
            sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
            out2_fastq = None
            sub2_fastq = None
            if pair2_fastq:
                p2, f2 = os.path.split(pair2_fastq)
                assert p1 == p2
                out2_fastq = os.path.join(out_path, f2)
                sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
            x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
                out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
            jobs.append(x)

        # One function call per FASTQ file; both files of a pair use
        # the same summary file.
        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, pair1_fastq, pair2_fastq, summary_file, \
                    out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
            args = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
                   num_mismatches
            jobs2.append((subtract_mouse_reads, args, {}))
            if pair2_fastq:
                args = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                       num_mismatches
                jobs2.append((subtract_mouse_reads, args, {}))

        nc = min(MAX_CORES, num_cores)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
        assert len(results) == len(jobs2)
        metadata["num_cores"] = nc

        # Make sure the fastq files were generated.
        out_filenames = [x[4] for x in jobs] + [x[5] for x in jobs]
        out_filenames = [x for x in out_filenames if x]
        # BUG: If all reads were removed, then this will fail incorrectly.
        filelib.assert_exists_nz_many(out_filenames)

        return metadata
Esempio n. 16
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run "bwa aln" on every merged FASTQ file to produce .sai files.

        antecedents is unpacked as (fastq_node, group_node,
        reference_node).  Every FASTQ file (both files of a pair are
        aligned separately) gets one command from
        alignlib.make_bwa_aln_command, with the .sai and .log files
        placed in out_path and named after the FASTQ file without its
        .fq/.fastq extension.  The commands are run through
        parallel.pshell with max_procs=num_cores.

        Returns the metadata dictionary with the keys "tool",
        "commands" and "num cores".

        Raises AssertionError if no FASTQ files are found, a sample is
        listed more than once, a sample has no first FASTQ file, or a
        .sai file fails the filelib.exists_nz check.
        """
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, group_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(
            group_node.identifier, fastq_node.identifier)
        assert fastq_files, "No FASTQ files found."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {"tool": "bwa %s" % alignlib.get_bwa_version()}

        # Every sample may appear only once.
        samples = [entry[0] for entry in fastq_files]
        assert len(samples) == len(set(samples)), "dup sample"

        # Flatten the samples into the list of FASTQ files to align.
        fastq_filenames = []
        for sample, pair1, pair2 in fastq_files:
            assert pair1
            fastq_filenames.append(pair1)
            if pair2:
                fastq_filenames.append(pair2)

        # Work out the output names for each FASTQ file.
        jobs = []  # list of (fastq_filename, sai_filename, log_filename)
        for in_filename in fastq_filenames:
            stem = os.path.split(in_filename)[1]
            lowered = stem.lower()
            # Drop a trailing .fq or .fastq, whatever its case.
            for extension in (".fq", ".fastq"):
                if lowered.endswith(extension):
                    stem = stem[:-len(extension)]
                    break
            jobs.append((
                in_filename,
                os.path.join(out_path, "%s.sai" % stem),
                os.path.join(out_path, "%s.log" % stem),
            ))

        # Threads per job: the cores divided over the jobs, at least 1.
        nc = max(1, num_cores / len(jobs))

        # Make the bwa commands.
        commands = [
            alignlib.make_bwa_aln_command(ref.fasta_file_full,
                                          fastq_filename,
                                          sai_filename,
                                          log_filename,
                                          num_threads=nc)
            for (fastq_filename, sai_filename, log_filename) in jobs]
        metadata["commands"] = commands
        # NOTE(review): this key has a space ("num cores"), while the
        # other modules in this file use "num_cores" -- confirm which
        # one consumers of the metadata expect before renaming.
        metadata["num cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for job in jobs:
            sai_filename = job[1]
            assert filelib.exists_nz(sai_filename), \
                   "Missing: %s" % sai_filename
        return metadata
Esempio n. 17
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Trim the merged FASTQ files of every sample with Trimmomatic.

        antecedents is unpacked as (fastq_node, sample_node).  One
        command is built per sample with _make_trimmomatic_cmd (defined
        elsewhere in this file).  The trimmed files keep the names of
        the input files and are placed in out_path, together with
        <sample>.unpaired_1.fasta, <sample>.unpaired_2.fasta and
        <sample>.log.  The commands are run through parallel.pshell
        with max_procs=num_cores.

        Returns the metadata dictionary with the keys "commands" and
        "num_cores".

        Raises AssertionError if no FASTQ files are found, a trimmed
        file fails the filelib.exists_nz check, or a log file starts
        with "Usage:".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        fastq_node, sample_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        adapters_filename = mlib.get_user_option(user_options,
                                                 "adapters_fasta",
                                                 not_empty=True,
                                                 check_file=True)
        # If the path contains a space, use a symlink without one (made
        # in the current working directory) instead.
        if " " in adapters_filename:
            os.symlink(adapters_filename, "adapters.txt")
            adapters_filename = "adapters.txt"

        jobs = []
        for sample, pair1, pair2 in fastq_files:
            p1, f1 = os.path.split(pair1)
            trimmed1 = os.path.join(out_path, f1)
            trimmed2 = None
            if pair2:
                p2, f2 = os.path.split(pair2)
                trimmed2 = os.path.join(out_path, f2)
            # The unpaired files carry the sample name.  With one shared
            # name, the jobs (which run in parallel) would all write to
            # the same two files.
            unpaired1 = os.path.join(
                out_path, "%s.unpaired_1.fasta" % sample)
            unpaired2 = os.path.join(
                out_path, "%s.unpaired_2.fasta" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = sample, pair1, pair2, trimmed1, trimmed2, \
                unpaired1, unpaired2, log_filename
            jobs.append(x)

        # Split the cores evenly over the jobs (fastq_files is known to
        # be non-empty here).  "//" keeps the result an integer even
        # where "/" is true division.
        nc = max(1, num_cores // len(jobs))

        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, pair1, pair2, trimmed1, trimmed2, unpaired1, unpaired2, \
                    log_filename = x
            cmd = _make_trimmomatic_cmd(pair1,
                                        pair2,
                                        trimmed1,
                                        trimmed2,
                                        unpaired1,
                                        unpaired2,
                                        adapters_filename,
                                        num_threads=nc)
            commands.append("%s >& %s" % (cmd, sq(log_filename)))
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for x in jobs:
            sample, pair1, pair2, trimmed1, trimmed2, unpaired1, unpaired2, \
                    log_filename = x
            # Make sure outfile created.
            assert filelib.exists_nz(trimmed1), \
                   "Missing: %s" % trimmed1
            if trimmed2:
                assert filelib.exists_nz(trimmed2), \
                       "Missing: %s" % trimmed2
            # A log that starts with the usage message means the
            # command line was not accepted.  Close the file right away
            # instead of leaving the handle open.
            with open(log_filename) as handle:
                log_text = handle.read()
            assert not log_text.startswith("Usage:"), "usage problem"
        return metadata