def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    An empty 'ScriptCode' subdirectory is also created inside the
    project directory.

    The project directory is named after 'project.full_name'.

    Arguments:
      project         : populated IlluminaProject object
      top_dir         : parent directory to create analysis subdirectory
                        under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                        each sample/barcode/read combination by merging
                        multiple FASTQs together (output files are named
                        '<sample>_<barcode>_R<read>.fastq')
      keep_names      : if True then links to FASTQ files will have the
                        same names as the original files; by default links
                        use the names returned by
                        IlluminaData.get_unique_fastq_names
      dry_run         : if True then report what would be done but don't
                        actually perform any action (no directories, links
                        or merged files are created)

    Returns:
      Path of the project directory.

    """
    # Honour the documented default: os.path.join cannot cope with a
    # None component, so fall back to the current working directory
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..."
          % project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    link_name = fastq
                else:
                    link_name = fastq_names[fastq]
                fastq_ln = os.path.join(project_dir, link_name)
                if os.path.exists(fastq_ln):
                    # Never clobber an existing link/file: report and
                    # carry on with the remaining FASTQs
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            # Gather replicates to be merged, keyed on the name of the
            # merged file they will contribute to
            replicates = {}
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort each group into order once everything is gathered
            for fastqs in replicates.values():
                fastqs.sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (reporting only when this is a dry run)
            if dry_run:
                continue
            for name in replicates:
                merged_fastq = os.path.join(project_dir, name + '.fastq')
                bcf_utils.concatenate_fastq_files(merged_fastq,
                                                  replicates[name])
    # Return directory name
    return project_dir
# Remaining command line arguments for the parser 'p'
p.add_argument('-v', '--verbose',
               action="store_true",
               dest="verbose",
               default=False,
               help="verbose output")
p.add_argument('fastqs',
               metavar="FASTQ",
               nargs='+',
               help="Input FASTQ to concatenate")
p.add_argument('fastq_out',
               metavar="FASTQ_OUT",
               help="Output FASTQ with concatenated reads")
args = p.parse_args()

# Two or more input files are required
if not len(args.fastqs) >= 2:
    p.error("Need to supply at least 2 input fastqs plus output name")

# Stop at the first input file that cannot be found
missing_fastq = next((path for path in args.fastqs
                      if not os.path.exists(path)), None)
if missing_fastq is not None:
    logging.critical("Input file '%s' not found" % missing_fastq)
    sys.exit(1)

# Concatenate the inputs; any failure is logged and turned into a
# non-zero exit status
try:
    concatenate_fastq_files(args.fastq_out,
                            args.fastqs,
                            verbose=args.verbose)
except Exception as ex:
    logging.critical("Failed with exception: %s" % ex)
    sys.exit(1)
if __name__ == "__main__":
    # Build the command line parser
    parser = optparse.OptionParser(
        usage="%prog [OPTIONS] FASTQ [FASTQ...] FASTQ_OUT",
        description="Concatenate reads from one or more input Fastq files "
        "into a single new file FASTQ_OUT.",
        version=__version__)
    parser.add_option('-v', '--verbose',
                      action="store_true",
                      dest="verbose",
                      default=False,
                      help="verbose output")
    options, positionals = parser.parse_args()

    # The final positional argument is the output file, everything
    # before it is an input file
    if not len(positionals) >= 2:
        parser.error("Need to supply at least 2 input fastqs plus output name")
    fastqs, fastq_out = positionals[:-1], positionals[-1]

    # Stop at the first input file that cannot be found
    missing_fastq = next((path for path in fastqs
                          if not os.path.exists(path)), None)
    if missing_fastq is not None:
        logging.critical("Input file '%s' not found" % missing_fastq)
        sys.exit(1)

    # Concatenate the inputs; any failure is logged and turned into a
    # non-zero exit status
    try:
        concatenate_fastq_files(fastq_out, fastqs, verbose=options.verbose)
    except Exception as ex:
        logging.critical("Failed with exception: %s" % ex)
        sys.exit(1)
# NOTE(review): final statement of the preceding FASTQ copy step; the
# statements enclosing it are not part of this block
shutil.copy(fastq_file, dst)

# Verify against sample sheet: report the outcome and exit with status
# 0 (verified) or 1 (failed). Nothing after this section runs when a
# sample sheet was supplied.
if options.sample_sheet is not None:
    if IlluminaData.verify_run_against_sample_sheet(
            illumina_data, options.sample_sheet):
        print("Verification against sample sheet '%s': OK" %
              options.sample_sheet)
        status = 0
    else:
        logging.error("Verification against sample sheet '%s': FAILED" %
                      options.sample_sheet)
        status = 1
    sys.exit(status)

# Merge multiple fastqs in each sample
if options.merge_fastqs:
    for project in illumina_data.projects:
        for sample in project.samples:
            # The merged file name only carries an '_R<read>' suffix for
            # paired-end samples, so for other samples looping over both
            # reads would target the same output file twice: merge read 1
            # only in that case
            reads = (1, 2) if sample.paired_end else (1,)
            for read in reads:
                # Concatenate fastqs for this read
                fastq_merged = sample.name
                if sample.paired_end:
                    fastq_merged += "_R%d" % read
                fastq_merged += ".fastq.gz"
                bcf_utils.concatenate_fastq_files(
                    fastq_merged,
                    sample.fastq_subset(read_number=read, full_path=True),
                    bufsize=1024 * 1024)
# Verify against sample sheet: report the outcome and exit with status
# 0 (verified) or 1 (failed). Nothing after this section runs when a
# sample sheet was supplied.
if options.sample_sheet is not None:
    if IlluminaData.verify_run_against_sample_sheet(illumina_data,
                                                    options.sample_sheet):
        print("Verification against sample sheet '%s': OK" %
              options.sample_sheet)
        status = 0
    else:
        logging.error("Verification against sample sheet '%s': FAILED" %
                      options.sample_sheet)
        status = 1
    sys.exit(status)

# Merge multiple fastqs in each sample
if options.merge_fastqs:
    for project in illumina_data.projects:
        for sample in project.samples:
            # The merged file name only carries an '_R<read>' suffix for
            # paired-end samples, so for other samples looping over both
            # reads would target the same output file twice: merge read 1
            # only in that case
            reads = (1, 2) if sample.paired_end else (1,)
            for read in reads:
                # Concatenate fastqs for this read
                fastq_merged = sample.name
                if sample.paired_end:
                    fastq_merged += "_R%d" % read
                fastq_merged += ".fastq.gz"
                bcf_utils.concatenate_fastq_files(
                    fastq_merged,
                    sample.fastq_subset(read_number=read,
                                        full_path=True),
                    bufsize=1024*1024)