def __init__(self, fastq): # Store name self.fastq = fastq # Basename and extension self.basename = strip_ngs_extensions(self.fastq) self.extension = self.fastq[len(self.basename):] self.basename = os.path.basename(self.basename) # Values that should be derived from the name # (should be set by subclass) self.sample_name = None self.sample_number = None self.barcode_sequence = None self.lane_number = None self.read_number = None self.set_number = None self.is_index_read = False
def fastq_strand_output(fastq): """ Generate name for fastq_strand.py output Given a Fastq file name, the output from fastq_strand.py will look like: - {FASTQ}_fastq_strand.txt Arguments: fastq (str): name of Fastq file Returns: tuple: fastq_strand.py output (without leading paths) """ return "%s_fastq_strand.txt" % strip_ngs_extensions( os.path.basename(fastq))
def fastqc_output(fastq): """ Generate name of FastQC outputs Given a Fastq file name, the outputs from FastQC will look like: - {FASTQ}_fastqc/ - {FASTQ}_fastqc.html - {FASTQ}_fastqc.zip Arguments: fastq (str): name of Fastq file Returns: tuple: FastQC outputs (without leading paths) """ base_name = "%s_fastqc" % strip_ngs_extensions(os.path.basename(fastq)) return (base_name, base_name + '.html', base_name + '.zip')
def fastqc_output(fastq): """ Generate name of FastQC outputs Given a Fastq file name, the outputs from FastQC will look like: - {FASTQ}_fastqc/ - {FASTQ}_fastqc.html - {FASTQ}_fastqc.zip Arguments: fastq (str): name of Fastq file Returns: tuple: FastQC outputs (without leading paths) """ base_name = "%s_fastqc" % strip_ngs_extensions(os.path.basename(fastq)) return (base_name,base_name+'.html',base_name+'.zip')
def fastq_screen_output(fastq, screen_name): """ Generate name of fastq_screen output files Given a Fastq file name and a screen name, the outputs from fastq_screen will look like: - {FASTQ}_{SCREEN_NAME}_screen.png - {FASTQ}_{SCREEN_NAME}_screen.txt Arguments: fastq (str): name of Fastq file screen_name (str): name of screen Returns: tuple: fastq_screen output names (without leading path) """ base_name = "%s_%s_screen" % (strip_ngs_extensions( os.path.basename(fastq)), str(screen_name)) return (base_name + '.png', base_name + '.txt')
def fastq_screen_output(fastq,screen_name): """ Generate name of fastq_screen output files Given a Fastq file name and a screen name, the outputs from fastq_screen will look like: - {FASTQ}_{SCREEN_NAME}_screen.png - {FASTQ}_{SCREEN_NAME}_screen.txt Arguments: fastq (str): name of Fastq file screen_name (str): name of screen Returns: tuple: fastq_screen output names (without leading path) """ base_name = "%s_%s_screen" % (strip_ngs_extensions(os.path.basename(fastq)), str(screen_name)) return (base_name+'.png',base_name+'.txt')
def fastq_strand(argv, working_dir=None): """ Driver for fastq_strand Generate strandedness statistics for single FASTQ or FASTQ pair, by running STAR using one or more genome indexes """ # Process command line p = argparse.ArgumentParser( description="Generate strandedness statistics " "for FASTQ or FASTQpair, by running STAR using " "one or more genome indexes", version=__version__) p.add_argument("r1", metavar="READ1", default=None, help="R1 Fastq file") p.add_argument("r2", metavar="READ2", default=None, nargs="?", help="R2 Fastq file") p.add_argument("-g", "--genome", dest="star_genomedirs", metavar="GENOMEDIR", default=None, action="append", help="path to directory with STAR index " "for genome to use (use as an alternative " "to -c/--conf; can be specified multiple " "times to include additional genomes)") p.add_argument("--subset", type=int, default=10000, help="use a random subset of read pairs " "from the input Fastqs; set to zero to " "use all reads (default: 10000)") p.add_argument("-o", "--outdir", default=None, help="specify directory to write final " "outputs to (default: current directory)") p.add_argument("-c", "--conf", metavar="FILE", default=None, help="specify delimited 'conf' file with " "list of NAME and STAR index directory " "pairs. NB if a conf file is supplied " "then any indices specifed on the command " "line will be ignored") p.add_argument("-n", type=int, default=1, help="number of threads to run STAR with " "(default: 1)") p.add_argument("--counts", action="store_true", help="include the count sums for " "unstranded, 1st read strand aligned and " "2nd read strand aligned in the output " "file (default: only include percentages)") p.add_argument("--keep-star-output", action="store_true", help="keep the output from STAR (default: " "delete outputs on completion)") args = p.parse_args(argv) # Print parameters print("READ1\t: %s" % args.r1) print("READ2\t: %s" % args.r2) # Check that STAR is on the path star_exe = find_program("STAR") if star_exe is None: logging.critical("STAR not found") return 1 print("STAR\t: %s" % star_exe) # Gather genome indices genome_names = {} if args.conf is not None: print("Conf file\t: %s" % args.conf) star_genomedirs = [] with open(args.conf, 'r') as fp: for line in fp: if line.startswith('#'): continue name, star_genomedir = line.rstrip().split('\t') star_genomedirs.append(star_genomedir) # Store an associated name genome_names[star_genomedir] = name else: star_genomedirs = args.star_genomedirs if not star_genomedirs: logging.critical("No genome indices specified") return 1 print("Genomes:") for genome in star_genomedirs: print("- %s" % genome) # Output directory if args.outdir is None: outdir = os.getcwd() else: outdir = os.path.abspath(args.outdir) if not os.path.exists(outdir): logging.critical("Output directory doesn't exist: %s" % outdir) return 1 # Output file outfile = "%s_fastq_strand.txt" % os.path.join( outdir, os.path.basename(strip_ngs_extensions(args.r1))) if os.path.exists(outfile): logging.warning("Removing existing output file '%s'" % outfile) os.remove(outfile) # Prefix for temporary output prefix = "fastq_strand_" # Working directory if working_dir is None: working_dir = os.getcwd() else: working_dir = os.path.abspath(working_dir) if not os.path.isdir(working_dir): raise Exception("Bad working directory: %s" % working_dir) print("Working directory: %s" % working_dir) # Make subset of input read pairs nreads = sum(1 for i in getreads(os.path.abspath(args.r1))) print("%d reads" % nreads) if args.subset == 0: print("Using all read pairs in Fastq files") subset = nreads elif args.subset > nreads: print("Actual number of read pairs smaller than requested subset") subset = nreads else: subset = args.subset print("Using random subset of %d read pairs" % subset) if subset == nreads: subset_indices = [i for i in xrange(nreads)] else: subset_indices = random.sample(xrange(nreads), subset) fqs_in = filter(lambda fq: fq is not None, (args.r1, args.r2)) fastqs = [] for fq in fqs_in: fq_subset = os.path.join(working_dir, os.path.basename(fq)) if fq_subset.endswith(".gz"): fq_subset = '.'.join(fq_subset.split('.')[:-1]) fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1]) with open(fq_subset, 'w') as fp: for read in getreads_subset(os.path.abspath(fq), subset_indices): fp.write('\n'.join(read) + '\n') fastqs.append(fq_subset) # Make directory to keep output from STAR if args.keep_star_output: star_output_dir = os.path.join( outdir, "STAR.%s.outputs" % os.path.basename(strip_ngs_extensions(args.r1))) print("Output from STAR will be copied to %s" % star_output_dir) # Check if directory already exists from earlier run if os.path.exists(star_output_dir): # Move out of the way i = 0 backup_dir = "%s.bak" % star_output_dir while os.path.exists(backup_dir): i += 1 backup_dir = "%s.bak%s" % (star_output_dir, i) logging.warning("Moving existing output directory to %s" % backup_dir) os.rename(star_output_dir, backup_dir) # Make the directory os.mkdir(star_output_dir) # Write output to a temporary file with tempfile.TemporaryFile() as fp: # Iterate over genome indices for star_genomedir in star_genomedirs: # Basename for output for this genome try: name = genome_names[star_genomedir] except KeyError: name = star_genomedir # Build a command line to run STAR star_cmd = [star_exe] star_cmd.extend([ '--runMode', 'alignReads', '--genomeLoad', 'NoSharedMemory', '--genomeDir', os.path.abspath(star_genomedir) ]) star_cmd.extend(['--readFilesIn', fastqs[0]]) if len(fastqs) > 1: star_cmd.append(fastqs[1]) star_cmd.extend([ '--quantMode', 'GeneCounts', '--outSAMtype', 'BAM', 'Unsorted', '--outSAMstrandField', 'intronMotif', '--outFileNamePrefix', prefix, '--runThreadN', str(args.n) ]) print("Running %s" % ' '.join(star_cmd)) try: subprocess.check_output(star_cmd, cwd=working_dir) except subprocess.CalledProcessError as ex: raise Exception("STAR returned non-zero exit code: %s" % ex.returncode) # Save the outputs if args.keep_star_output: # Make a subdirectory for this genome index genome_dir = os.path.join(star_output_dir, name.replace(os.sep, "_")) print("Copying STAR outputs to %s" % genome_dir) os.mkdir(genome_dir) for f in os.listdir(working_dir): if f.startswith(prefix): shutil.copy(os.path.join(working_dir, f), os.path.join(genome_dir, f)) # Process the STAR output star_tab_file = os.path.join(working_dir, "%sReadsPerGene.out.tab" % prefix) if not os.path.exists(star_tab_file): raise Exception("Failed to find .out file: %s" % star_tab_file) sum_col2 = 0 sum_col3 = 0 sum_col4 = 0 with open(star_tab_file) as out: for i, line in enumerate(out): if i < 4: # Skip first four lines continue # Process remaining delimited columns cols = line.rstrip('\n').split('\t') sum_col2 += int(cols[1]) sum_col3 += int(cols[2]) sum_col4 += int(cols[3]) print("Sums:") print("- col2: %d" % sum_col2) print("- col3: %d" % sum_col3) print("- col4: %d" % sum_col4) if sum_col2 > 0.0: forward_1st = float(sum_col3) / float(sum_col2) * 100.0 reverse_2nd = float(sum_col4) / float(sum_col2) * 100.0 else: logging.warning("Sum of mapped reads is zero!") forward_1st = 0.0 reverse_2nd = 0.0 print("Strand percentages:") print("- 1st forward: %.2f%%" % forward_1st) print("- 2nd reverse: %.2f%%" % reverse_2nd) # Append to output file data = [name, "%.2f" % forward_1st, "%.2f" % reverse_2nd] if args.counts: data.extend([sum_col2, sum_col3, sum_col4]) fp.write("%s\n" % "\t".join([str(d) for d in data])) # Finished iterating over genomes # Rewind temporary output file fp.seek(0) with open(outfile, 'w') as out: # Header out.write("#fastq_strand version: %s\t" "#Aligner: %s\t" "#Reads in subset: %s\n" % (__version__, "STAR", subset)) columns = ["Genome", "1st forward", "2nd reverse"] if args.counts: columns.extend([ "Unstranded", "1st read strand aligned", "2nd read strand aligned" ]) out.write("#%s\n" % "\t".join(columns)) # Copy content from temp to final file for line in fp: out.write(line) return 0
def fastq_strand(argv,working_dir=None): """ Driver for fastq_strand Generate strandedness statistics for single FASTQ or FASTQ pair, by running STAR using one or more genome indexes """ # Process command line p = argparse.ArgumentParser( description="Generate strandedness statistics " "for FASTQ or FASTQpair, by running STAR using " "one or more genome indexes", version=__version__) p.add_argument("r1",metavar="READ1", default=None, help="R1 Fastq file") p.add_argument("r2",metavar="READ2", default=None, nargs="?", help="R2 Fastq file") p.add_argument("-g","--genome", dest="star_genomedirs",metavar="GENOMEDIR", default=None, action="append", help="path to directory with STAR index " "for genome to use (use as an alternative " "to -c/--conf; can be specified multiple " "times to include additional genomes)") p.add_argument("--subset", type=int, default=10000, help="use a random subset of read pairs " "from the input Fastqs; set to zero to " "use all reads (default: 10000)") p.add_argument("-o","--outdir", default=None, help="specify directory to write final " "outputs to (default: current directory)") p.add_argument("-c","--conf",metavar="FILE", default=None, help="specify delimited 'conf' file with " "list of NAME and STAR index directory " "pairs. NB if a conf file is supplied " "then any indices specifed on the command " "line will be ignored") p.add_argument("-n", type=int, default=1, help="number of threads to run STAR with " "(default: 1)") p.add_argument("--counts", action="store_true", help="include the count sums for " "unstranded, 1st read strand aligned and " "2nd read strand aligned in the output " "file (default: only include percentages)") p.add_argument("--keep-star-output", action="store_true", help="keep the output from STAR (default: " "delete outputs on completion)") args = p.parse_args(argv) # Print parameters print "READ1\t: %s" % args.r1 print "READ2\t: %s" % args.r2 # Check that STAR is on the path star_exe = find_program("STAR") if star_exe is None: logging.critical("STAR not found") return 1 print "STAR\t: %s" % star_exe # Gather genome indices genome_names = {} if args.conf is not None: print "Conf file\t: %s" % args.conf star_genomedirs = [] with open(args.conf,'r') as fp: for line in fp: if line.startswith('#'): continue name,star_genomedir = line.rstrip().split('\t') star_genomedirs.append(star_genomedir) # Store an associated name genome_names[star_genomedir] = name else: star_genomedirs = args.star_genomedirs if not star_genomedirs: logging.critical("No genome indices specified") return 1 print "Genomes:" for genome in star_genomedirs: print "- %s" % genome # Output directory if args.outdir is None: outdir = os.getcwd() else: outdir = os.path.abspath(args.outdir) if not os.path.exists(outdir): logging.critical("Output directory doesn't exist: %s" % outdir) return 1 # Output file outfile = "%s_fastq_strand.txt" % os.path.join( outdir, os.path.basename(strip_ngs_extensions(args.r1))) if os.path.exists(outfile): logging.warning("Removing existing output file '%s'" % outfile) os.remove(outfile) # Prefix for temporary output prefix = "fastq_strand_" # Working directory if working_dir is None: working_dir = os.getcwd() else: working_dir = os.path.abspath(working_dir) if not os.path.isdir(working_dir): raise Exception("Bad working directory: %s" % working_dir) print "Working directory: %s" % working_dir # Make subset of input read pairs nreads = sum(1 for i in getreads(os.path.abspath(args.r1))) print "%d reads" % nreads if args.subset == 0: print "Using all read pairs in Fastq files" subset = nreads elif args.subset > nreads: print "Actual number of read pairs smaller than requested subset" subset = nreads else: subset = args.subset print "Using random subset of %d read pairs" % subset if subset == nreads: subset_indices = [i for i in xrange(nreads)] else: subset_indices = random.sample(xrange(nreads),subset) fqs_in = filter(lambda fq: fq is not None,(args.r1,args.r2)) fastqs = [] for fq in fqs_in: fq_subset = os.path.join(working_dir, os.path.basename(fq)) if fq_subset.endswith(".gz"): fq_subset = '.'.join(fq_subset.split('.')[:-1]) fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1]) with open(fq_subset,'w') as fp: for read in getreads_subset(os.path.abspath(fq), subset_indices): fp.write('\n'.join(read) + '\n') fastqs.append(fq_subset) # Make directory to keep output from STAR if args.keep_star_output: star_output_dir = os.path.join(outdir, "STAR.%s.outputs" % os.path.basename( strip_ngs_extensions(args.r1))) print "Output from STAR will be copied to %s" % star_output_dir # Check if directory already exists from earlier run if os.path.exists(star_output_dir): # Move out of the way i = 0 backup_dir = "%s.bak" % star_output_dir while os.path.exists(backup_dir): i += 1 backup_dir = "%s.bak%s" % (star_output_dir,i) logging.warning("Moving existing output directory to %s" % backup_dir) os.rename(star_output_dir,backup_dir) # Make the directory os.mkdir(star_output_dir) # Write output to a temporary file with tempfile.TemporaryFile() as fp: # Iterate over genome indices for star_genomedir in star_genomedirs: # Basename for output for this genome try: name = genome_names[star_genomedir] except KeyError: name = star_genomedir # Build a command line to run STAR star_cmd = [star_exe] star_cmd.extend([ '--runMode','alignReads', '--genomeLoad','NoSharedMemory', '--genomeDir',os.path.abspath(star_genomedir)]) star_cmd.extend(['--readFilesIn', fastqs[0]]) if len(fastqs) > 1: star_cmd.append(fastqs[1]) star_cmd.extend([ '--quantMode','GeneCounts', '--outSAMtype','BAM','Unsorted', '--outSAMstrandField','intronMotif', '--outFileNamePrefix',prefix, '--runThreadN',str(args.n)]) print "Running %s" % ' '.join(star_cmd) try: subprocess.check_output(star_cmd,cwd=working_dir) except subprocess.CalledProcessError as ex: raise Exception("STAR returned non-zero exit code: %s" % ex.returncode) # Save the outputs if args.keep_star_output: # Make a subdirectory for this genome index genome_dir = os.path.join(star_output_dir, name.replace(os.sep,"_")) print "Copying STAR outputs to %s" % genome_dir os.mkdir(genome_dir) for f in os.listdir(working_dir): if f.startswith(prefix): shutil.copy(os.path.join(working_dir,f), os.path.join(genome_dir,f)) # Process the STAR output star_tab_file = os.path.join(working_dir, "%sReadsPerGene.out.tab" % prefix) if not os.path.exists(star_tab_file): raise Exception("Failed to find .out file: %s" % star_tab_file) sum_col2 = 0 sum_col3 = 0 sum_col4 = 0 with open(star_tab_file) as out: for i,line in enumerate(out): if i < 4: # Skip first four lines continue # Process remaining delimited columns cols = line.rstrip('\n').split('\t') sum_col2 += int(cols[1]) sum_col3 += int(cols[2]) sum_col4 += int(cols[3]) print "Sums:" print "- col2: %d" % sum_col2 print "- col3: %d" % sum_col3 print "- col4: %d" % sum_col4 if sum_col2 > 0.0: forward_1st = float(sum_col3)/float(sum_col2)*100.0 reverse_2nd = float(sum_col4)/float(sum_col2)*100.0 else: logging.warning("Sum of mapped reads is zero!") forward_1st = 0.0 reverse_2nd = 0.0 print "Strand percentages:" print "- 1st forward: %.2f%%" % forward_1st print "- 2nd reverse: %.2f%%" % reverse_2nd # Append to output file data = [name, "%.2f" % forward_1st, "%.2f" % reverse_2nd] if args.counts: data.extend([sum_col2,sum_col3,sum_col4]) fp.write("%s\n" % "\t".join([str(d) for d in data])) # Finished iterating over genomes # Rewind temporary output file fp.seek(0) with open(outfile,'w') as out: # Header out.write("#fastq_strand version: %s\t" "#Aligner: %s\t" "#Reads in subset: %s\n" % (__version__, "STAR", subset)) columns = ["Genome","1st forward","2nd reverse"] if args.counts: columns.extend(["Unstranded", "1st read strand aligned", "2nd read strand aligned"]) out.write("#%s\n" % "\t".join(columns)) # Copy content from temp to final file for line in fp: out.write(line) return 0