Example #1
import re

# NB 'getreads' is assumed to come from the enclosing package
def get_fastq_lanes(fastq):
    """
    Return list of lanes present in Fastq file

    Arguments:
      fastq (str): path to Fastq file (can
        be gzipped)

    Returns:
      Tuple: tuple (n,lanes) where ``n`` is the
        number of reads and ``lanes`` is a list
        of integer lane numbers.
    """
    regex = re.compile(r"^([^:]*:){3}(\d*):")
    nreads = 0
    lanes = set()
    for read in getreads(fastq):
        nreads += 1
        try:
            lane = regex.match(''.join(read)).group(2)
            lanes.add(int(lane))
        except AttributeError:
            raise Exception("Failed to find lane in read %s: "
                            "not a valid Fastq file?" % '\n'.join(read))
    return (nreads, sorted(list(lanes)))
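
A minimal usage sketch (the file name is hypothetical; 'getreads' is
assumed to yield each FASTQ record as a list of four lines, as the
joins in the code above imply):

# Hypothetical call on a gzipped Fastq
n_reads, lanes = get_fastq_lanes("sample_S1_R1_001.fastq.gz")
print("%d reads in lanes: %s" % (n_reads, lanes))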
Example #3
import argparse
import logging
import optparse
import os
import random
import shutil
import subprocess
import sys
import tempfile

# NB 'find_program', 'getreads', 'getreads_regex', 'getreads_subset',
# 'strip_ngs_extensions', '__version__' and '__description__' are
# assumed to come from the enclosing package
def fastq_strand(argv, working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQ pair, by running STAR using "
        "one or more genome indexes")
    p.add_argument("--version", action="version", version=__version__)
    p.add_argument("r1", metavar="READ1", default=None, help="R1 Fastq file")
    p.add_argument("r2",
                   metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g",
                   "--genome",
                   dest="star_genomedirs",
                   metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o",
                   "--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c",
                   "--conf",
                   metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print("READ1\t: %s" % args.r1)
    print("READ2\t: %s" % args.r2)
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print("STAR\t: %s" % star_exe)
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print("Conf file\t: %s" % args.conf)
        star_genomedirs = []
        with open(args.conf, 'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                name, star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print("Genomes:")
    for genome in star_genomedirs:
        print("- %s" % genome)
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir, os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print("Working directory: %s" % working_dir)
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print("%d reads" % nreads)
    if args.subset == 0:
        print("Using all read pairs in Fastq files")
        subset = nreads
    elif args.subset > nreads:
        print("Actual number of read pairs smaller than requested subset")
        subset = nreads
    else:
        subset = args.subset
        print("Using random subset of %d read pairs" % subset)
    if subset == nreads:
        subset_indices = list(range(nreads))
    else:
        subset_indices = random.sample(range(nreads), subset)
    fqs_in = [fq for fq in (args.r1, args.r2) if fq is not None]
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir, os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset, 'w') as fp:
            for read in getreads_subset(os.path.abspath(fq), subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(
            outdir, "STAR.%s.outputs" %
            os.path.basename(strip_ngs_extensions(args.r1)))
        print("Output from STAR will be copied to %s" % star_output_dir)
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir, i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir, backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file (text mode, so strings can be
    # written and read back on rewind)
    with tempfile.TemporaryFile(mode='w+') as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode', 'alignReads', '--genomeLoad', 'NoSharedMemory',
                '--genomeDir',
                os.path.abspath(star_genomedir)
            ])
            star_cmd.extend(['--readFilesIn', fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode', 'GeneCounts', '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMstrandField', 'intronMotif', '--outFileNamePrefix',
                prefix, '--runThreadN',
                str(args.n)
            ])
            print("Running %s" % ' '.join(star_cmd))
            try:
                subprocess.check_output(star_cmd, cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep, "_"))
                print("Copying STAR outputs to %s" % genome_dir)
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir, f),
                                    os.path.join(genome_dir, f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" % star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i, line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print("Sums:")
            print("- col2: %d" % sum_col2)
            print("- col3: %d" % sum_col3)
            print("- col4: %d" % sum_col4)
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3) / float(sum_col2) * 100.0
                reverse_2nd = float(sum_col4) / float(sum_col2) * 100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print("Strand percentages:")
            print("- 1st forward: %.2f%%" % forward_1st)
            print("- 2nd reverse: %.2f%%" % reverse_2nd)
            # Append to output file
            data = [name, "%.2f" % forward_1st, "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2, sum_col3, sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile, 'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__, "STAR", subset))
            columns = ["Genome", "1st forward", "2nd reverse"]
            if args.counts:
                columns.extend([
                    "Unstranded", "1st read strand aligned",
                    "2nd read strand aligned"
                ])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
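
A hedged entry-point sketch for the driver above (paths are
illustrative; STAR must be on the PATH and the genome index already
built):

# Hypothetical invocation: R1/R2 pair against a single STAR index,
# using a 1000-read subset
exit_code = fastq_strand(["-g", "/data/star/hg38",
                          "--subset", "1000",
                          "SMP_R1.fastq.gz", "SMP_R2.fastq.gz"])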
def main(args=None):
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = optparse.OptionParser(usage="%prog -m PATTERN | -n NREADS infile "
                              "[ infile ... ]",
                              version="%prog "+__version__,
                              description=__description__)
    p.add_option('-m','--match',action='store',dest='pattern',default=None,
                 help="extract records that match Python regular "
                 "expression PATTERN")
    p.add_option('-n',action='store',dest='n',default=None,
                 help="extract N random reads from the input file(s). "
                 "If multiple files are supplied (e.g. R1/R2 pair) then "
                 "the same subsets will be extracted for each. "
                 "(Optionally a percentage can be supplied instead e.g. "
                 "'50%' to extract a subset of half the reads.)")
    p.add_option('-s','--seed',action='store',dest='seed',default=None,
                 help="specify seed for random number generator (used "
                 "for -n option; using the same seed should produce the "
                 "same 'random' sample of reads)")
    opts,args = p.parse_args(args)
    if len(args) < 1:
        p.error("Need to supply at least one input file")
    # Pattern matching option
    if opts.pattern is not None:
        if opts.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print "Extracting reads matching '%s'" % opts.pattern
        for f in args:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_regex.fq'
            print "Extracting to %s" % outfile
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,opts.pattern):
                    fp.write('\n'.join(read) + '\n')
    else:
        # Seed random number generator
        if opts.seed is not None:
            random.seed(opts.seed)
        # Count the reads
        nreads = sum(1 for i in getreads(args[0]))
        print "Number of reads: %s" % nreads
        if len(args) > 1:
            print "Verifying read numbers match between files"
        for f in args[1:]:
            if sum(1 for i in getreads(f)) != nreads:
                print "Inconsistent numbers of reads between files"
                sys.exit(1)
        # Generate a subset of read indices to extract
        if opts.n is None:
            p.error("Need to supply one of -n or -m options")
        try:
            nsubset = int(opts.n)
        except ValueError:
            if str(opts.n).endswith('%'):
                nsubset = int(float(opts.n[:-1])*nreads/100.0)
            else:
                p.error("Unrecognised value for -n: '%s'" % opts.n)
        if nsubset > nreads:
            print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                      nreads))
            sys.exit(1)
        print("Generating set of %s random indices" % nsubset)
        subset_indices = random.sample(range(nreads), nsubset)
        # Extract the reads to separate files
        for f in args:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_%s.fq' % nsubset
            print "Extracting to %s" % outfile
            with open(outfile,'w') as fp:
                for read in getreads_subset(f,subset_indices):
                    fp.write('\n'.join(read) + '\n')
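
Illustrative command lines for this variant (the script name is
hypothetical; '-m' takes a Python regular expression matched against
read records, '-n' a count of reads):

python extract_reads.py -n 1000 -s 42 sample_R1.fastq.gz sample_R2.fastq.gz
python extract_reads.py -m 'AGATCGGAAGAG' sample_R1.fastq.gz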
Example #5
import argparse
import os
import random
import sys
from collections import defaultdict

# NB helpers such as 'getreads', 'getreads_regex', 'getreads_subset',
# 'AnalysisFastq', 'ICell8WellList', 'BufferedOutputFiles',
# 'ProgressChecker', 'reverse_complement', 'update_fastq_read_index',
# 'report', '__version__' and '__description__' are assumed to come
# from the enclosing package
def main(args=None):
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = argparse.ArgumentParser(description=__description__)
    p.add_argument('--version',action='version',
                   version="%(prog)s "+__version__)
    p.add_argument('-m','--match',action='store',dest='pattern',
                   default=None,
                   help="extract records that match Python regular "
                   "expression PATTERN")
    p.add_argument('-n',action='store',dest='n',default=None,
                   help="extract N random reads from the input file(s). "
                   "If multiple files are supplied (e.g. R1/R2 pair) then "
                   "the same subsets will be extracted for each. "
                   "(Optionally a percentage can be supplied instead e.g. "
                   "'50%%' to extract a subset of half the reads.)")
    p.add_argument('-s','--seed',action='store',dest='seed',default=None,
                   help="specify seed for random number generator (used "
                   "for -n option; using the same seed should produce the "
                   "same 'random' sample of reads)")
    p.add_argument('infiles',metavar='infile',nargs='+',
                   help="input FASTQ, CSFASTA, or QUAL file")
    args = p.parse_args(args)
    # Pattern matching option
    if args.pattern is not None:
        if args.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print("Extracting reads matching '%s'" % args.pattern)
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_regex.fq'
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,args.pattern):
                    fp.write('\n'.join(read) + '\n')
    else:
        # Seed random number generator
        if args.seed is not None:
            random.seed(args.seed)
        # Count the reads
        nreads = sum(1 for i in getreads(args.infiles[0]))
        print("Number of reads: %s" % nreads)
        if len(args.infiles) > 1:
            print("Verifying read numbers match between files")
        for f in args.infiles[1:]:
            if sum(1 for i in getreads(f)) != nreads:
                print("Inconsistent numbers of reads between files")
                sys.exit(1)
        # Generate a subset of read indices to extract
        if args.n is None:
            p.error("Need to supply one of -n or -m options")
        try:
            nsubset = int(args.n)
        except ValueError:
            if str(args.n).endswith('%'):
                nsubset = int(float(args.n[:-1])*nreads/100.0)
            else:
                p.error("Unrecognised value for -n: '%s'" % args.n)
        if nsubset > nreads:
            print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                      nreads))
            sys.exit(1)
        print("Generating set of %s random indices" % nsubset)
        subset_indices = random.sample(range(nreads),nsubset)
        # Extract the reads to separate files
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_%s.fq' % nsubset
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_subset(f,subset_indices):
                    fp.write('\n'.join(read) + '\n')
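
The argparse port behaves the same at the command line; a percentage
subset, for example (script name again hypothetical):

python extract_reads.py -n 50% -s 42 sample_R1.fastq.gz sample_R2.fastq.gz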
def assign_reads(args):
    """
    Assign reads to samples from batched ICELL8 ATAC Fastqs

    Intended to be invoked via 'map' or similar function

    Arguments are supplied in a single list which should
    contain the following items:

    - R1 Fastq: path to R1 Fastq file
    - R2 Fastq: path to R2 Fastq file
    - I1 Fastq: path to I1 Fastq file
    - I2 Fastq: path to I2 Fastq file
    - well list: path to the well list file
    - mode: either 'samples' or 'barcodes'
    - swap_i1_and_i2: boolean indicating whether I1 and I2
      Fastqs should be swapped for matching
    - reverse_complement: either None, 'i1', 'i2' or 'both'
    - rewrite_fastq_headers: boolean indicating whether to
      write the matching ICELL8 barcodes into the Fastq
      read headers on output
    - working_dir: working directory to write batches to
    - unassigned: 'sample name' to associate with unassigned
      reads (used as a basename for the output files)

    In 'samples' mode assignment is done to samples only;
    in 'barcodes' mode assignment is done to samples and
    barcodes.

    Arguments:
      args (list): list containing the arguments supplied to
        the read assigner

    Returns:
      Tuple: tuple consisting of (batch id,barcode_counts,
        unassigned_barcodes_file).
    """
    # Unpack arguments
    (fastq_r1,fastq_r2,fastq_i1,fastq_i2,
     well_list_file,mode,swap_i1_and_i2,
     reverse_complement_index,rewrite_fastq_headers,
     working_dir,unassigned) = args
    # Batch ID is the trailing part of the name
    batch_id = AnalysisFastq(fastq_i1).extras.strip('_')
    # Label is sample name plus batch name
    label = "%s/%s" % (AnalysisFastq(fastq_i1).sample_name,batch_id)
    report("[%s] Assigning reads from R1/R2 Fastq pairs based on I1/I2 Fastqs:"
           % label)
    report("[%s] -- R1: %s" % (label,os.path.basename(fastq_r1)))
    report("[%s] -- R2: %s" % (label,os.path.basename(fastq_r2)))
    report("[%s] -- I1: %s" % (label,os.path.basename(fastq_i1)))
    report("[%s] -- I2: %s" % (label,os.path.basename(fastq_i2)))
    report("[%s] -- Well list: %s" % (label,os.path.basename(well_list_file)))
    report("[%s] Mode is '%s'" % (label,mode))
    if swap_i1_and_i2:
        report("[%s] Swapping I1 and I2 Fastqs for matching to well list" %
               label)
    if rewrite_fastq_headers:
        report("[%s] Rewriting Fastq read headers to include well list "
               "barcodes" % label)
    # Check mode
    if mode not in ("samples","barcodes"):
        report("[%s] Unrecognised mode!" % label,fp=sys.stderr)
        raise Exception("Unrecognised mode: '%s'" % mode)
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    os.mkdir(os.path.join(working_dir,batch_id))
    # Read well list file to get barcodes and lookups
    well_list = ICell8WellList(well_list_file)
    sample_lookup = defaultdict(lambda: unassigned)
    barcode_lookup = defaultdict(lambda: unassigned)
    for sample in well_list.samples():
        barcode_lookup[sample] = list()
    barcodes = well_list.barcodes()
    for barcode in barcodes:
        sample = well_list.sample(barcode)
        sample_lookup[barcode] = sample
        barcode_lookup[sample].append(barcode)
    # Generate adjusted versions of barcodes for matching
    # against barcodes derived from Fastqs
    fastq_barcode_lookup = defaultdict(lambda: None)
    for barcode in barcodes:
        i1,i2 = barcode.split('+')
        if reverse_complement_index:
            if reverse_complement_index in ('i1','both'):
                # Reverse complement the I1 part of each barcode
                i1 = reverse_complement(i1)
            if reverse_complement_index in ('i2','both'):
                # Reverse complement the I2 part of each barcode
                i2 = reverse_complement(i2)
        if swap_i1_and_i2:
            i2,i1 = i1,i2
        fastq_barcode_lookup["%s+%s" % (i1,i2)] = barcode
    # File to write unassigned barcodes to
    unassigned_barcodes_file = os.path.join(working_dir,
                                            batch_id,
                                            "unassigned_barcodes.txt")
    # Set up output files for samples
    samples = well_list.samples()
    samples.insert(0,unassigned)
    fpp = BufferedOutputFiles()
    for read in ('R1','R2','I1','I2'):
        for index,sample in enumerate(samples):
            if mode == 'samples':
                # Output files will only have sample names
                name = "%s_%s" % (sample,read)
                filen = "%s_S%d_%s_001.fastq" % (sample,index,read)
                fpp.open(name,
                         os.path.join(working_dir,batch_id,filen))
            elif mode == 'barcodes':
                # Output files will have sample name plus barcode
                if sample != unassigned:
                    # Standard samples
                    for barcode in barcode_lookup[sample]:
                        name = "%s_%s_%s" % (sample,barcode,read)
                        filen = "%s_S%d_%s_%s_001.fastq" % \
                                (sample,index,barcode,read)
                        fpp.open(name,
                                 os.path.join(working_dir,batch_id,filen))
                else:
                    # Unassigned reads
                    name = "%s_%s" % (sample,read)
                    filen = "%s_S%d_%s_001.fastq" % (sample,index,read)
                    fpp.open(name,
                             os.path.join(working_dir,batch_id,filen))
    barcode_counts = { unassigned: 0, }
    for barcode in well_list.barcodes():
        barcode_counts[barcode] = 0
    # Examine indices and assign reads
    ii = 0
    progress = ProgressChecker(every=1000000)
    if mode == 'samples':
        # Assigning reads to samples
        with open(unassigned_barcodes_file,"w") as fp:
            for r1,r2,i1,i2 in zip(getreads(fastq_r1),
                                   getreads(fastq_r2),
                                   getreads(fastq_i1),
                                   getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1],i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1,barcode)
                    r2 = update_fastq_read_index(r2,barcode)
                    i1 = update_fastq_read_index(i1,barcode)
                    i2 = update_fastq_read_index(i2,barcode)
                # Write the reads to the appropriate destinations
                fpp.write("%s_R1" % sample,'\n'.join(r1))
                fpp.write("%s_R2" % sample,'\n'.join(r2))
                fpp.write("%s_I1" % sample,'\n'.join(i1))
                fpp.write("%s_I2" % sample,'\n'.join(i2))
                # Write Fastq version of unassigned barcode to file
                if sample == unassigned:
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label,ii))
    elif mode == 'barcodes':
        # Assigning reads to barcodes
        with open(unassigned_barcodes_file,"w") as fp:
            for r1,r2,i1,i2 in zip(getreads(fastq_r1),
                                   getreads(fastq_r2),
                                   getreads(fastq_i1),
                                   getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1],i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1,barcode)
                    r2 = update_fastq_read_index(r2,barcode)
                    i1 = update_fastq_read_index(i1,barcode)
                    i2 = update_fastq_read_index(i2,barcode)
                # Write the reads to the appropriate destinations
                if sample != unassigned:
                    # Assign to sample and barcode
                    fpp.write("%s_%s_R1" % (sample,barcode),'\n'.join(r1))
                    fpp.write("%s_%s_R2" % (sample,barcode),'\n'.join(r2))
                    fpp.write("%s_%s_I1" % (sample,barcode),'\n'.join(i1))
                    fpp.write("%s_%s_I2" % (sample,barcode),'\n'.join(i2))
                else:
                    # Write unassigned barcode to file
                    fpp.write("%s_R1" % sample,'\n'.join(r1))
                    fpp.write("%s_R2" % sample,'\n'.join(r2))
                    fpp.write("%s_I1" % sample,'\n'.join(i1))
                    fpp.write("%s_I2" % sample,'\n'.join(i2))
                    # Write Fastq version of unassigned barcode to file
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label,ii))
    report("[%s] Finished processing batch %s" % (label,batch_id))
    # Close files
    fpp.close()
    # Remove original files
    for fq in (fastq_r1,fastq_r2,fastq_i1,fastq_i2):
        report("[%s] Removing %s" % (label,fq))
        os.remove(fq)
    # Returns tuple with batch ID, barcode counts and
    # file with list of unassigned barcodes
    return (batch_id,barcode_counts,unassigned_barcodes_file)
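
A hedged sketch of driving assign_reads over pre-batched Fastqs with
multiprocessing (file, directory and sample names are illustrative;
note that the function removes its input Fastqs on completion):

from multiprocessing import Pool

# Each tuple carries the eleven arguments listed in the docstring above
batches = [
    ("S1_R1.B000.fastq","S1_R2.B000.fastq",
     "S1_I1.B000.fastq","S1_I2.B000.fastq",
     "wells.txt","samples",False,None,True,"work","Unassigned"),
    ("S1_R1.B001.fastq","S1_R2.B001.fastq",
     "S1_I1.B001.fastq","S1_I2.B001.fastq",
     "wells.txt","samples",False,None,True,"work","Unassigned"),
]
with Pool(2) as pool:
    for batch_id,counts,unassigned_file in pool.map(assign_reads,batches):
        print("Batch %s: %d unassigned" % (batch_id,counts["Unassigned"]))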