예제 #1
0
    def simple(fastq=None, fp=None):
        """
        Count the reads in a FASTQ file

        Thin wrapper which hands the counting off to the
        FASTQFile.nreads function, passing both arguments
        through unchanged.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Number of reads

        """
        read_count = FASTQFile.nreads(fastq=fastq, fp=fp)
        return read_count
def batch_fastqs(fastqs, nbatches, basename="batched", out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates the input Fastq files (using 'zcat' if the
    name of the first file ends with '.gz', 'cat' otherwise)
    and pipes the result into the external 'split' utility,
    which writes the reads into smaller Fastq files.

    The output files are named

      <basename>.B<NNN>.r<read_number>.fastq

    where NNN is the zero-padded batch index and the read
    number is taken from the first input Fastq.

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      nbatches (int): number of batches to output reads
        into
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs (and the 'batch.sh' wrapper
        script) will be written (default: current working
        directory)

    Returns:
      List: paths of the batched Fastq files, one for each
        of the requested batches.

    Raises:
      ValueError: if 'fastqs' is empty or 'nbatches' is
        less than 1.
      Exception: if the input Fastqs contain no reads, or
        if the batching command finishes with a non-zero
        exit code.
    """
    # Validate the arguments before doing any work
    if not fastqs:
        raise ValueError("No Fastq files supplied for batching")
    if nbatches < 1:
        raise ValueError("Number of batches must be at least 1 "
                         "(got %s)" % nbatches)

    # Default to the current directory for the outputs
    # (os.path.join cannot handle 'None')
    if out_dir is None:
        out_dir = os.getcwd()

    # Count the total number of reads
    # NB single-argument print(...) behaves identically under
    # Python 2 and Python 3
    print("Fetching read counts:")
    nreads = 0
    for fq in fastqs:
        n = FASTQFile.nreads(fq)
        print("%s:\t%d" % (os.path.basename(fq), n))
        nreads += n
    print("Total reads: %d" % nreads)
    if nreads == 0:
        # A batch size of zero is meaningless
        raise Exception("No reads found in input Fastqs")

    # Determine batch size: integer division of the reads by
    # the number of batches, rounded up when there is a
    # remainder so that batch_size*nbatches >= nreads always
    # holds
    batch_size = nreads // nbatches
    if nreads % nbatches:
        # Round up batch size
        batch_size += 1
    print("Creating batches of %d reads" % batch_size)

    # Check if fastqs are compressed
    # (only the first file name is examined)
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')

    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number

    # Build and run the batching command
    # The line count handed to 'split' assumes four lines
    # per read
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|', 'split', '-l', batch_size * 4, '-d', '-a', 3,
                       '--additional-suffix=%s' % suffix, '-',
                       os.path.join(out_dir, "%s.B" % basename))
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)

    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)

    # Collect and return the batched Fastq names
    # NOTE(review): because the batch size is rounded up,
    # 'split' may write fewer than 'nbatches' files (e.g. 9
    # reads into 4 batches gives 3 files of 3 reads), in which
    # case the trailing names will not exist on disk - confirm
    # whether callers need to handle this
    batched_fastqs = [
        os.path.join(out_dir, "%s.B%03d%s" % (basename, i, suffix))
        for i in range(0, nbatches)
    ]
    return batched_fastqs
예제 #3
0
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn, fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (
                            fastq, bcf_utils.format_file_size(fsize), nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn, fastq)
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq: