def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    The project directory is named after 'project.full_name' and is
    created under 'top_dir'. An empty 'ScriptCode' subdirectory is
    also created inside it.

    Arguments:
      project         : populated IlluminaProject object
      top_dir         : parent directory to create analysis subdirectory
                        under. Defaults to the current working directory
                        if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                        each sample by merging multiple FASTQs together
      keep_names      : if True then links to FASTQ files will have the
                        same names as the original files; by default
                        links use the shortest unique name
      dry_run         : if True then report what would be done but
                        don't actually perform any action (no
                        directories, links or merged files are made)

    Returns:
      Path of the project directory (i.e. 'top_dir' joined with
      'project.full_name').

    """
    # Honour the documented default: os.path.join() cannot accept None
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    fastq_ln = os.path.join(project_dir, fastq)
                else:
                    fastq_ln = os.path.join(project_dir, fastq_names[fastq])
                if os.path.exists(fastq_ln):
                    # Never overwrite an existing entry; report and move on
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            # Gather replicates to be merged, keyed on
            # <sample_name>_<barcode_sequence>_R<read_number>
            replicates = {}
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort each group into order (once, after gathering)
            for name in replicates:
                replicates[name].sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for dry runs, where the project
            # directory may not even exist)
            for name in replicates:
                merged_fastq = os.path.join(project_dir, name + '.fastq')
                if not dry_run:
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
def create_directory(self,
                     illumina_project=None,
                     fastqs=None,
                     fastq_dir=None,
                     short_fastq_names=False,
                     link_to_fastqs=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory corresponding to the AnalysisProject
    object, and optionally also populates with links to FASTQ files
    from a supplied IlluminaProject object.

    The directory structure it creates is:

    dir/
       fastqs/
       logs/
       ScriptCode/

    NOTE(review): only the top-level directory, 'ScriptCode' (with a
    placeholder README.txt) and the fastq subdirectory are created by
    the code in this method itself; 'logs' is presumably handled
    elsewhere - confirm.

    It also saves the project metadata to the info file.

    Arguments:
      illumina_project: (optional) populated IlluminaProject object
        from which the analysis directory will be populated; only
        consulted when 'fastqs' is None
      fastqs: (optional) list of fastq files to import
      fastq_dir: (optional) name of subdirectory to put fastq files
        into; defaults to 'fastqs'
      short_fastq_names: (optional) if True then transform fastq file
        names to be the shortest possible unique names; if False
        (default) then use the original fastq names
      link_to_fastqs: (optional) if True then make symbolic links to
        the fastq files; if False (default) then make hard links

    """
    logger.debug("Creating analysis directory for project '%s'" %
                 self.name)
    # Check for & create directory
    if os.path.exists(self.dirn):
        logger.warning("Directory %s already exists" % self.dirn)
    else:
        logger.debug("Making analysis directory %s" % self.dirn)
        bcf_utils.mkdir(self.dirn, mode=0o775)
    # Make a 'ScriptCode' directory
    scriptcode_dir = os.path.join(self.dirn, "ScriptCode")
    bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    # Put a file in ScriptCode to make sure it's
    # not pruned on subsequent rsync operations
    # (the context manager closes the file even if the write fails)
    with open(os.path.join(scriptcode_dir, 'README.txt'), 'w') as fp:
        fp.write(
            "The ScriptCode directory is a place to put custom scripts and programs"
        )
    # Make a 'fastqs' directory
    if fastq_dir is None:
        fastq_dir = "fastqs"
    fastq_dir = os.path.join(self.dirn, fastq_dir)
    bcf_utils.mkdir(fastq_dir, mode=0o775)
    # Check for & create links to fastq files
    if fastqs is None:
        # Make a list of fastqs to import from the supplied
        # IlluminaProject object
        fastqs = []
        if illumina_project is not None:
            for sample in illumina_project.samples:
                for fastq in sample.fastq:
                    fastqs.append(os.path.join(sample.dirn, fastq))
    if short_fastq_names:
        # Get mapping to (shortened) unique names
        fastq_names = IlluminaData.get_unique_fastq_names(fastqs)
    else:
        # Use full names
        fastq_names = {}
        for fq in fastqs:
            fastq_names[fq] = os.path.basename(fq)
    for fastq in fastqs:
        target_fq = os.path.join(fastq_dir, fastq_names[fastq])
        if os.path.exists(target_fq):
            # Leave pre-existing targets untouched
            logger.warning("Target '%s' already exists" % target_fq)
        elif link_to_fastqs:
            logger.debug("Making symlink to %s" % fastq)
            bcf_utils.mklink(fastq, target_fq, relative=True)
        else:
            logger.debug("Making hard link to %s" % fastq)
            os.link(fastq, target_fq)
    # Populate
    self.populate(fastq_dir=os.path.basename(fastq_dir))
    # Update metadata: primary fastq dir (stored relative to the
    # project directory)
    self.info['primary_fastq_dir'] = os.path.relpath(fastq_dir, self.dirn)
    # Update metadata: sample summary
    self.info['samples'] = self.sample_summary()
    # Save metadata
    self.info.save(self.info_file)