Example #1
import re

# NB 'getreads' is assumed to come from the enclosing package
def get_fastq_lanes(fastq):
    """
    Return list of lanes present in Fastq file

    Arguments:
      fastq (str): path to Fastq file (can
        be gzipped)

    Returns:
      Tuple: tuple (n,lanes) where ``n`` is the
        number of reads and ``lanes`` is a list
        of integer lane numbers.
    """
    regex = re.compile(r"^([^:]*:){3}(\d*):")
    nreads = 0
    lanes = set()
    for read in getreads(fastq):
        nreads += 1
        try:
            lane = regex.match(''.join(read)).group(2)
            lanes.add(int(lane))
        except AttributeError:
            raise Exception("Failed to find lane in read %s: "
                            "not a valid Fastq file?" % '\n'.join(read))
    return (nreads, sorted(list(lanes)))
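
A minimal usage sketch (the file name is hypothetical; 'getreads' is
assumed to yield each FASTQ record as a list of four lines, as the
joins in the code above imply):

# Hypothetical call on a gzipped Fastq
n_reads, lanes = get_fastq_lanes("sample_S1_R1_001.fastq.gz")
print("%d reads in lanes: %s" % (n_reads, lanes))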
Example #3
import argparse
import logging
import optparse
import os
import random
import shutil
import subprocess
import sys
import tempfile

# NB 'find_program', 'getreads', 'getreads_regex', 'getreads_subset',
# 'strip_ngs_extensions', '__version__' and '__description__' are
# assumed to come from the enclosing package
def fastq_strand(argv, working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQ pair, by running STAR using "
        "one or more genome indexes")
    p.add_argument("--version", action="version", version=__version__)
    p.add_argument("r1", metavar="READ1", default=None, help="R1 Fastq file")
    p.add_argument("r2",
                   metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g",
                   "--genome",
                   dest="star_genomedirs",
                   metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o",
                   "--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c",
                   "--conf",
                   metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print("READ1\t: %s" % args.r1)
    print("READ2\t: %s" % args.r2)
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print("STAR\t: %s" % star_exe)
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print("Conf file\t: %s" % args.conf)
        star_genomedirs = []
        with open(args.conf, 'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                name, star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print("Genomes:")
    for genome in star_genomedirs:
        print("- %s" % genome)
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir, os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print("Working directory: %s" % working_dir)
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print("%d reads" % nreads)
    if args.subset == 0:
        print("Using all read pairs in Fastq files")
        subset = nreads
    elif args.subset > nreads:
        print("Actual number of read pairs smaller than requested subset")
        subset = nreads
    else:
        subset = args.subset
        print("Using random subset of %d read pairs" % subset)
    if subset == nreads:
        subset_indices = list(range(nreads))
    else:
        subset_indices = random.sample(range(nreads), subset)
    fqs_in = [fq for fq in (args.r1, args.r2) if fq is not None]
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir, os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset, 'w') as fp:
            for read in getreads_subset(os.path.abspath(fq), subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(
            outdir, "STAR.%s.outputs" %
            os.path.basename(strip_ngs_extensions(args.r1)))
        print("Output from STAR will be copied to %s" % star_output_dir)
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir, i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir, backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file (text mode, so strings can be
    # written and read back on rewind)
    with tempfile.TemporaryFile(mode='w+') as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode', 'alignReads', '--genomeLoad', 'NoSharedMemory',
                '--genomeDir',
                os.path.abspath(star_genomedir)
            ])
            star_cmd.extend(['--readFilesIn', fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode', 'GeneCounts', '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMstrandField', 'intronMotif', '--outFileNamePrefix',
                prefix, '--runThreadN',
                str(args.n)
            ])
            print("Running %s" % ' '.join(star_cmd))
            try:
                subprocess.check_output(star_cmd, cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep, "_"))
                print("Copying STAR outputs to %s" % genome_dir)
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir, f),
                                    os.path.join(genome_dir, f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" % star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i, line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print("Sums:")
            print("- col2: %d" % sum_col2)
            print("- col3: %d" % sum_col3)
            print("- col4: %d" % sum_col4)
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3) / float(sum_col2) * 100.0
                reverse_2nd = float(sum_col4) / float(sum_col2) * 100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print("Strand percentages:")
            print("- 1st forward: %.2f%%" % forward_1st)
            print("- 2nd reverse: %.2f%%" % reverse_2nd)
            # Append to output file
            data = [name, "%.2f" % forward_1st, "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2, sum_col3, sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile, 'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__, "STAR", subset))
            columns = ["Genome", "1st forward", "2nd reverse"]
            if args.counts:
                columns.extend([
                    "Unstranded", "1st read strand aligned",
                    "2nd read strand aligned"
                ])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
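
A hedged entry-point sketch for the driver above (paths are
illustrative; STAR must be on the PATH and the genome index already
built):

# Hypothetical invocation: R1/R2 pair against a single STAR index,
# using a 1000-read subset
exit_code = fastq_strand(["-g", "/data/star/hg38",
                          "--subset", "1000",
                          "SMP_R1.fastq.gz", "SMP_R2.fastq.gz"])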
def main(args=None):
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = optparse.OptionParser(usage="%prog -m PATTERN | -n NREADS infile "
                              "[ infile ... ]",
                              version="%prog "+__version__,
                              description=__description__)
    p.add_option('-m','--match',action='store',dest='pattern',default=None,
                 help="extract records that match Python regular "
                 "expression PATTERN")
    p.add_option('-n',action='store',dest='n',default=None,
                 help="extract N random reads from the input file(s). "
                 "If multiple files are supplied (e.g. R1/R2 pair) then "
                 "the same subsets will be extracted for each. "
                 "(Optionally a percentage can be supplied instead e.g. "
                 "'50%' to extract a subset of half the reads.)")
    p.add_option('-s','--seed',action='store',dest='seed',default=None,
                 help="specify seed for random number generator (used "
                 "for -n option; using the same seed should produce the "
                 "same 'random' sample of reads)")
    opts,args = p.parse_args(args)
    if len(args) < 1:
        p.error("Need to supply at least one input file")
    # Pattern matching option
    if opts.pattern is not None:
        if opts.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print "Extracting reads matching '%s'" % opts.pattern
        for f in args:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_regex.fq'
            print "Extracting to %s" % outfile
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,opts.pattern):
                    fp.write('\n'.join(read) + '\n')
    else:
        # Seed random number generator
        if opts.seed is not None:
            random.seed(opts.seed)
        # Count the reads
        nreads = sum(1 for i in getreads(args[0]))
        print "Number of reads: %s" % nreads
        if len(args) > 1:
            print "Verifying read numbers match between files"
        for f in args[1:]:
            if sum(1 for i in getreads(f)) != nreads:
                print "Inconsistent numbers of reads between files"
                sys.exit(1)
        # Generate a subset of read indices to extract
        if opts.n is None:
            p.error("Need to supply one of -n or -m options")
        try:
            nsubset = int(opts.n)
        except ValueError:
            if str(opts.n).endswith('%'):
                nsubset = int(float(opts.n[:-1])*nreads/100.0)
            else:
                p.error("Unrecognised value for -n: '%s'" % opts.n)
        if nsubset > nreads:
            print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                      nreads))
            sys.exit(1)
        print("Generating set of %s random indices" % nsubset)
        subset_indices = random.sample(range(nreads), nsubset)
        # Extract the reads to separate files
        for f in args:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_%s.fq' % nsubset
            print "Extracting to %s" % outfile
            with open(outfile,'w') as fp:
                for read in getreads_subset(f,subset_indices):
                    fp.write('\n'.join(read) + '\n')
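
Illustrative command lines for this variant (the script name is
hypothetical; '-m' takes a Python regular expression matched against
read records, '-n' a count of reads):

python extract_reads.py -n 1000 -s 42 sample_R1.fastq.gz sample_R2.fastq.gz
python extract_reads.py -m 'AGATCGGAAGAG' sample_R1.fastq.gz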
Example #5
import argparse
import os
import random
import sys
from collections import defaultdict

# NB helpers such as 'getreads', 'getreads_regex', 'getreads_subset',
# 'AnalysisFastq', 'ICell8WellList', 'BufferedOutputFiles',
# 'ProgressChecker', 'reverse_complement', 'update_fastq_read_index',
# 'report', '__version__' and '__description__' are assumed to come
# from the enclosing package
def main(args=None):
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = argparse.ArgumentParser(description=__description__)
    p.add_argument('--version',action='version',
                   version="%(prog)s "+__version__)
    p.add_argument('-m','--match',action='store',dest='pattern',
                   default=None,
                   help="extract records that match Python regular "
                   "expression PATTERN")
    p.add_argument('-n',action='store',dest='n',default=None,
                   help="extract N random reads from the input file(s). "
                   "If multiple files are supplied (e.g. R1/R2 pair) then "
                   "the same subsets will be extracted for each. "
                   "(Optionally a percentage can be supplied instead e.g. "
                   "'50%%' to extract a subset of half the reads.)")
    p.add_argument('-s','--seed',action='store',dest='seed',default=None,
                   help="specify seed for random number generator (used "
                   "for -n option; using the same seed should produce the "
                   "same 'random' sample of reads)")
    p.add_argument('infiles',metavar='infile',nargs='+',
                   help="input FASTQ, CSFASTA, or QUAL file")
    args = p.parse_args(args)
    # Pattern matching option
    if args.pattern is not None:
        if args.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print("Extracting reads matching '%s'" % args.pattern)
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_regex.fq'
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,args.pattern):
                    fp.write('\n'.join(read) + '\n')
    else:
        # Seed random number generator
        if args.seed is not None:
            random.seed(args.seed)
        # Count the reads
        nreads = sum(1 for i in getreads(args.infiles[0]))
        print("Number of reads: %s" % nreads)
        if len(args.infiles) > 1:
            print("Verifying read numbers match between files")
        for f in args.infiles[1:]:
            if sum(1 for i in getreads(f)) != nreads:
                print("Inconsistent numbers of reads between files")
                sys.exit(1)
        # Generate a subset of read indices to extract
        if args.n is None:
            p.error("Need to supply one of -n or -m options")
        try:
            nsubset = int(args.n)
        except ValueError:
            if str(args.n).endswith('%'):
                nsubset = int(float(args.n[:-1])*nreads/100.0)
            else:
                p.error("Unrecognised value for -n: '%s'" % args.n)
        if nsubset > nreads:
            print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                      nreads))
            sys.exit(1)
        print("Generating set of %s random indices" % nsubset)
        subset_indices = random.sample(range(nreads),nsubset)
        # Extract the reads to separate files
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_%s.fq' % nsubset
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_subset(f,subset_indices):
                    fp.write('\n'.join(read) + '\n')
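
The argparse port behaves the same at the command line; a percentage
subset, for example (script name again hypothetical):

python extract_reads.py -n 50% -s 42 sample_R1.fastq.gz sample_R2.fastq.gz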
def assign_reads(args):
    """
    Assign reads to samples from batched ICELL8 ATAC Fastqs

    Intended to be invoked via 'map' or similar function

    Arguments are supplied in a single list which should
    contain the following items:

    - R1 Fastq: path to R1 Fastq file
    - R2 Fastq: path to R2 Fastq file
    - I1 Fastq: path to I1 Fastq file
    - I2 Fastq: path to I2 Fastq file
    - well list: path to the well list file
    - mode: either 'samples' or 'barcodes'
    - swap_i1_and_i2: boolean indicating whether I1 and I2
      Fastqs should be swapped for matching
    - reverse_complement: either None, 'i1', 'i2' or 'both'
    - rewrite_fastq_headers: boolean indicating whether to
      write the matching ICELL8 barcodes into the Fastq
      read headers on output
    - working_dir: working directory to write batches to
    - unassigned: 'sample name' to associate with unassigned
      reads (used as a basename for the output files)

    In 'samples' mode assignment is done to samples only;
    in 'barcodes' mode assignment is done to samples and
    barcodes.

    Arguments:
      args (list): list containing the arguments supplied to
        the read assigner

    Returns:
      Tuple: tuple consisting of (batch id,barcode_counts,
        unassigned_barcodes_file).
    """
    # Unpack arguments
    (fastq_r1,fastq_r2,fastq_i1,fastq_i2,
     well_list_file,mode,swap_i1_and_i2,
     reverse_complement_index,rewrite_fastq_headers,
     working_dir,unassigned) = args
    # Batch ID is the trailing part of the name
    batch_id = AnalysisFastq(fastq_i1).extras.strip('_')
    # Label is sample name plus batch name
    label = "%s/%s" % (AnalysisFastq(fastq_i1).sample_name,batch_id)
    report("[%s] Assigning reads from R1/R2 Fastq pairs based on I1/I2 Fastqs:"
           % label)
    report("[%s] -- R1: %s" % (label,os.path.basename(fastq_r1)))
    report("[%s] -- R2: %s" % (label,os.path.basename(fastq_r2)))
    report("[%s] -- I1: %s" % (label,os.path.basename(fastq_i1)))
    report("[%s] -- I2: %s" % (label,os.path.basename(fastq_i2)))
    report("[%s] -- Well list: %s" % (label,os.path.basename(well_list_file)))
    report("[%s] Mode is '%s'" % (label,mode))
    if swap_i1_and_i2:
        report("[%s] Swapping I1 and I2 Fastqs for matching to well list" %
               label)
    if rewrite_fastq_headers:
        report("[%s] Rewriting Fastq read headers to include well list "
               "barcodes" % label)
    # Check mode
    if mode not in ("samples","barcodes"):
        report("[%s] Unrecognised mode!" % label,fp=sys.stderr)
        raise Exception("Unrecognised mode: '%s'" % mode)
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    os.mkdir(os.path.join(working_dir,batch_id))
    # Read well list file to get barcodes and lookups
    well_list = ICell8WellList(well_list_file)
    sample_lookup = defaultdict(lambda: unassigned)
    barcode_lookup = defaultdict(lambda: unassigned)
    for sample in well_list.samples():
        barcode_lookup[sample] = list()
    barcodes = well_list.barcodes()
    for barcode in barcodes:
        sample = well_list.sample(barcode)
        sample_lookup[barcode] = sample
        barcode_lookup[sample].append(barcode)
    # Generate adjusted versions of barcodes for matching
    # against barcodes derived from Fastqs
    fastq_barcode_lookup = defaultdict(lambda: None)
    for barcode in barcodes:
        i1,i2 = barcode.split('+')
        if reverse_complement_index:
            if reverse_complement_index in ('i1','both'):
                # Reverse complement the I1 part of each barcode
                i1 = reverse_complement(i1)
            if reverse_complement_index in ('i2','both'):
                # Reverse complement the I2 part of each barcode
                i2 = reverse_complement(i2)
        if swap_i1_and_i2:
            i2,i1 = i1,i2
        fastq_barcode_lookup["%s+%s" % (i1,i2)] = barcode
    # File to write unassigned barcodes to
    unassigned_barcodes_file = os.path.join(working_dir,
                                            batch_id,
                                            "unassigned_barcodes.txt")
    # Set up output files for samples
    samples = well_list.samples()
    samples.insert(0,unassigned)
    fpp = BufferedOutputFiles()
    for read in ('R1','R2','I1','I2'):
        for index,sample in enumerate(samples):
            if mode == 'samples':
                # Output files will only have sample names
                name = "%s_%s" % (sample,read)
                filen = "%s_S%d_%s_001.fastq" % (sample,index,read)
                fpp.open(name,
                         os.path.join(working_dir,batch_id,filen))
            elif mode == 'barcodes':
                # Output files will have sample name plus barcode
                if sample != unassigned:
                    # Standard samples
                    for barcode in barcode_lookup[sample]:
                        name = "%s_%s_%s" % (sample,barcode,read)
                        filen = "%s_S%d_%s_%s_001.fastq" % \
                                (sample,index,barcode,read)
                        fpp.open(name,
                                 os.path.join(working_dir,batch_id,filen))
                else:
                    # Unassigned reads
                    name = "%s_%s" % (sample,read)
                    filen = "%s_S%d_%s_001.fastq" % (sample,index,read)
                    fpp.open(name,
                             os.path.join(working_dir,batch_id,filen))
    barcode_counts = { unassigned: 0, }
    for barcode in well_list.barcodes():
        barcode_counts[barcode] = 0
    # Examine indices and assign reads
    ii = 0
    progress = ProgressChecker(every=1000000)
    if mode == 'samples':
        # Assigning reads to samples
        with open(unassigned_barcodes_file,"w") as fp:
            for r1,r2,i1,i2 in zip(getreads(fastq_r1),
                                   getreads(fastq_r2),
                                   getreads(fastq_i1),
                                   getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1],i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1,barcode)
                    r2 = update_fastq_read_index(r2,barcode)
                    i1 = update_fastq_read_index(i1,barcode)
                    i2 = update_fastq_read_index(i2,barcode)
                # Write the reads to the appropriate destinations
                fpp.write("%s_R1" % sample,'\n'.join(r1))
                fpp.write("%s_R2" % sample,'\n'.join(r2))
                fpp.write("%s_I1" % sample,'\n'.join(i1))
                fpp.write("%s_I2" % sample,'\n'.join(i2))
                # Write Fastq version of unassigned barcode to file
                if sample == unassigned:
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label,ii))
    elif mode == 'barcodes':
        # Assigning reads to barcodes
        with open(unassigned_barcodes_file,"w") as fp:
            for r1,r2,i1,i2 in zip(getreads(fastq_r1),
                                   getreads(fastq_r2),
                                   getreads(fastq_i1),
                                   getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1],i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1,barcode)
                    r2 = update_fastq_read_index(r2,barcode)
                    i1 = update_fastq_read_index(i1,barcode)
                    i2 = update_fastq_read_index(i2,barcode)
                # Write the reads to the appropriate destinations
                if sample != unassigned:
                    # Assign to sample and barcode
                    fpp.write("%s_%s_R1" % (sample,barcode),'\n'.join(r1))
                    fpp.write("%s_%s_R2" % (sample,barcode),'\n'.join(r2))
                    fpp.write("%s_%s_I1" % (sample,barcode),'\n'.join(i1))
                    fpp.write("%s_%s_I2" % (sample,barcode),'\n'.join(i2))
                else:
                    # Write unassigned barcode to file
                    fpp.write("%s_R1" % sample,'\n'.join(r1))
                    fpp.write("%s_R2" % sample,'\n'.join(r2))
                    fpp.write("%s_I1" % sample,'\n'.join(i1))
                    fpp.write("%s_I2" % sample,'\n'.join(i2))
                    # Write Fastq version of unassigned barcode to file
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label,ii))
    report("[%s] Finished processing batch %s" % (label,batch_id))
    # Close files
    fpp.close()
    # Remove original files
    for fq in (fastq_r1,fastq_r2,fastq_i1,fastq_i2):
        report("[%s] Removing %s" % (label,fq))
        os.remove(fq)
    # Returns tuple with batch ID, barcode counts and
    # file with list of unassigned barcodes
    return (batch_id,barcode_counts,unassigned_barcodes_file)
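
A hedged sketch of driving assign_reads over pre-batched Fastqs with
multiprocessing (file, directory and sample names are illustrative;
note that the function removes its input Fastqs on completion):

from multiprocessing import Pool

# Each tuple carries the eleven arguments listed in the docstring above
batches = [
    ("S1_R1.B000.fastq","S1_R2.B000.fastq",
     "S1_I1.B000.fastq","S1_I2.B000.fastq",
     "wells.txt","samples",False,None,True,"work","Unassigned"),
    ("S1_R1.B001.fastq","S1_R2.B001.fastq",
     "S1_I1.B001.fastq","S1_I2.B001.fastq",
     "wells.txt","samples",False,None,True,"work","Unassigned"),
]
with Pool(2) as pool:
    for batch_id,counts,unassigned_file in pool.map(assign_reads,batches):
        print("Batch %s: %d unassigned" % (batch_id,counts["Unassigned"]))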