def create_analysis_dir(project, top_dir=None, merge_replicates=False,
                        keep_names=False, dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    The directory is named after 'project.full_name'. An empty
    'ScriptCode' subdirectory is also created inside it.

    Arguments:
      project   : populated IlluminaProject object
      top_dir   : parent directory to create analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                  each sample by merging multiple FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use the
                  shortest unique name
      dry_run   : if True then report what would be done but don't
                  actually perform any action (no directories, links or
                  merged files are created)

    Returns:
      Path of the project directory.

    """
    # Honour the documented default: os.path.join() cannot take None
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    link_name = fastq
                else:
                    link_name = fastq_names[fastq]
                fastq_ln = os.path.join(project_dir, link_name)
                if os.path.exists(fastq_ln):
                    # Never clobber something that is already there
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            # Gather replicates to be merged, keyed on the name of the
            # merged file (sample name, barcode and read number)
            replicates = {}
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort each group into order (once, after gathering)
            for fastqs in replicates.values():
                fastqs.sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in sorted(replicates):
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped entirely for a dry run)
            if not dry_run:
                for name in sorted(replicates):
                    merged_fastq = os.path.join(project_dir, name + '.fastq')
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: project_name = project.name if project.expt_type is not None: project_name += "_%s" % project.expt_type project_dir = os.path.join(illumina_analysis_dir,project_name) print "Creating analysis directory for project '%s'..." % project_name # Check for & create directory if os.path.exists(project_dir): print "-> %s already exists" % project_dir else: print "Making analysis directory for %s" % project.name if not options.dry_run: bcf_utils.mkdir(project_dir,mode=0775) # Check for & create links to fastq files if not options.merge_replicates: for sample in project.samples: fastq_names = get_unique_fastqs(sample) for fastq in sample.fastq: fastq_file = os.path.join(sample.dirn,fastq) if options.keep_names: fastq_ln = os.path.join(project_dir,fastq) else: fastq_ln = os.path.join(project_dir,fastq_names[fastq]) if os.path.exists(fastq_ln): logging.error("Failed to link to %s: %s already exists" % (fastq_file,os.path.basename(fastq_ln))) else: print "Linking to %s" % fastq
def buildAnalysisDirs(self, top_dir=None, dry_run=False,
                      link_type="relative", naming_scheme="partial"):
    """Construct and populate analysis directories for the experiments

    For each defined experiment, create the required analysis
    directories and populate with links to the primary data files.

    Arguments:
      top_dir: if set then create the analysis directories as
        subdirs of the specified directory; otherwise operate in cwd
      dry_run: if True then only report the mkdir, ln etc operations
        that would be performed. Default is False (do perform the
        operations).
      link_type: type of link to use when linking to primary data,
        one of 'relative' or 'absolute'. Any value other than
        'absolute' results in relative links.
      naming_scheme: naming scheme to use for links to primary data,
        one of 'full' (same names as primary data files), 'partial'
        (cut-down version of the full name which excludes sample
        names - the default), or 'minimal' (just the library name).
    """
    # Type of link
    use_relative_links = (link_type != 'absolute')

    def make_dir(path):
        # Create 'path', or just report the operation for a dry run
        if dry_run:
            print("mkdir %s" % path)
        else:
            bcf_utils.mkdir(path, mode=0o775)

    def link_reads(library, expt_dir, f5):
        # Link the csfasta/qual pair of one read set (F3, or F5 if 'f5'
        # is set) for 'library' into 'expt_dir'
        if f5:
            label = "F5"
            attrs = ("csfasta_f5", "qual_f5")
            link_names = LinkNames(naming_scheme).names(library, F5=True)
        else:
            label = "F3"
            attrs = ("csfasta", "qual")
            link_names = LinkNames(naming_scheme).names(library)
        ln_csfasta, ln_qual = link_names
        print("\t\t%s" % ln_csfasta)
        print("\t\t%s" % ln_qual)
        # Best effort: a failure is logged and processing carries on
        try:
            for attr, ln_name in zip(attrs, (ln_csfasta, ln_qual)):
                self.__linkToFile(getattr(library, attr),
                                  os.path.join(expt_dir, ln_name),
                                  relative=use_relative_links,
                                  dry_run=dry_run)
        except Exception as ex:
            logging.error("Failed to link to some or all %s primary data"
                          % label)
            logging.error("Exception: %s" % ex)

    # Deal with top_dir
    if top_dir:
        if os.path.exists(top_dir):
            print("Directory %s already exists" % top_dir)
        else:
            if not dry_run:
                print("Creating %s" % top_dir)
            make_dir(top_dir)
    # For each experiment, make and populate directory
    for expt in self.experiments:
        print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                           expt.sample, expt.library))
        expt_dir = expt.dirname(top_dir)
        print("\tDir: %s" % expt_dir)
        # Make directory
        if os.path.exists(expt_dir):
            logging.warning("Directory %s already exists" % expt_dir)
        else:
            make_dir(expt_dir)
        # Locate the primary data
        for run in self.solid_runs:
            paired_end = SolidData.is_paired_end(run)
            for library in run.fetchLibraries(expt.sample, expt.library):
                # Links to primary data - F3
                link_reads(library, expt_dir, False)
                # Links to F5 reads (if paired-end run)
                if paired_end:
                    link_reads(library, expt_dir, True)
class ExperimentList:
    """Container for a collection of Experiments

    Experiments are created and added to the ExperimentList by calling
    the addExperiment method, which returns a new Experiment object.

    The calling subprogram then populates the Experiment properties as
    appropriate.

    Once all Experiments are defined the analysis directory can be
    constructed by calling the buildAnalysisDirs method, which creates
    directories and symbolic links to primary data according to the
    definition of each experiment.
    """

    def __init__(self, solid_run_dir=None):
        """Create a new ExperimentList instance.

        Arguments:
          solid_run_dir: (optional) the path of the source SOLiD run
            directory.
        """
        self.experiments = []
        self.solid_run_dir = solid_run_dir
        self.solid_runs = []
        self.__getSolidRunData()

    def __getSolidRunData(self):
        """Get data about SOLiD runs

        Internal function to construct SolidRun objects based on the
        supplied SOLiD run directory. Both the directory itself and a
        sibling with '_2' appended to its name are examined.
        """
        if self.solid_run_dir is None:
            return
        logging.debug("Acquiring run information")
        for solid_dir in (self.solid_run_dir, self.solid_run_dir + "_2"):
            logging.debug("Examining %s" % solid_dir)
            run = SolidData.SolidRun(solid_dir)
            if run:
                self.solid_runs.append(run)
            else:
                logging.debug("Unable to get run data for %s" % solid_dir)
        if not self.solid_runs:
            logging.warning("No run data found")

    def addExperiment(self, name):
        """Create a new Experiment and add to the list

        Arguments:
          name: the name of the new experiment

        Returns:
          New Experiment object with name already set
        """
        new_expt = Experiment()
        new_expt.name = name
        self.experiments.append(new_expt)
        return new_expt

    def addDuplicateExperiment(self, expt):
        """Duplicate an existing Experiment and add to the list

        Arguments:
          expt: an existing Experiment object

        Returns:
          New Experiment object (the result of expt.copy())
        """
        new_expt = expt.copy()
        self.experiments.append(new_expt)
        return new_expt

    def getLastExperiment(self):
        """Return the last Experiment added to the list

        Returns None if no experiments have been added.
        """
        try:
            return self.experiments[-1]
        except IndexError:
            return None

    def __makeDir(self, path, dry_run):
        """Create directory 'path', or only report it for a dry run
        """
        if dry_run:
            # Report what would have been done
            print("mkdir %s" % path)
        else:
            bcf_utils.mkdir(path, mode=0o775)

    def __linkLibraryData(self, library, expt_dir, naming_scheme,
                          relative, dry_run, f5=False):
        """Link the csfasta/qual pair for one read set of a library

        Links the F3 data by default, or the F5 data if 'f5' is True.
        Failures are logged rather than raised, so that the remaining
        data can still be processed.
        """
        if f5:
            label = "F5"
            attrs = ("csfasta_f5", "qual_f5")
            link_names = LinkNames(naming_scheme).names(library, F5=True)
        else:
            label = "F3"
            attrs = ("csfasta", "qual")
            link_names = LinkNames(naming_scheme).names(library)
        ln_csfasta, ln_qual = link_names
        print("\t\t%s" % ln_csfasta)
        print("\t\t%s" % ln_qual)
        try:
            for attr, ln_name in zip(attrs, (ln_csfasta, ln_qual)):
                self.__linkToFile(getattr(library, attr),
                                  os.path.join(expt_dir, ln_name),
                                  relative=relative,
                                  dry_run=dry_run)
        except Exception as ex:
            logging.error("Failed to link to some or all %s primary data"
                          % label)
            logging.error("Exception: %s" % ex)

    def buildAnalysisDirs(self, top_dir=None, dry_run=False,
                          link_type="relative", naming_scheme="partial"):
        """Construct and populate analysis directories for the experiments

        For each defined experiment, create the required analysis
        directories and populate with links to the primary data files.
        An empty 'ScriptCode' subdirectory is also made for each
        experiment.

        Arguments:
          top_dir: if set then create the analysis directories as
            subdirs of the specified directory; otherwise operate in cwd
          dry_run: if True then only report the mkdir, ln etc operations
            that would be performed. Default is False (do perform the
            operations).
          link_type: type of link to use when linking to primary data,
            one of 'relative' or 'absolute'. Any value other than
            'absolute' results in relative links.
          naming_scheme: naming scheme to use for links to primary data,
            one of 'full' (same names as primary data files), 'partial'
            (cut-down version of the full name which excludes sample
            names - the default), or 'minimal' (just the library name).
        """
        # Deal with top_dir
        if top_dir:
            if os.path.exists(top_dir):
                print("Directory %s already exists" % top_dir)
            else:
                if not dry_run:
                    print("Creating %s" % top_dir)
                self.__makeDir(top_dir, dry_run)
        # Type of link
        use_relative_links = (link_type != 'absolute')
        # For each experiment, make and populate directory
        for expt in self.experiments:
            print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                               expt.sample, expt.library))
            expt_dir = expt.dirname(top_dir)
            print("\tDir: %s" % expt_dir)
            # Make directory
            if os.path.exists(expt_dir):
                logging.warning("Directory %s already exists" % expt_dir)
            else:
                self.__makeDir(expt_dir, dry_run)
            # Locate the primary data
            for run in self.solid_runs:
                paired_end = SolidData.is_paired_end(run)
                for library in run.fetchLibraries(expt.sample, expt.library):
                    # Links to primary data - F3
                    self.__linkLibraryData(library, expt_dir, naming_scheme,
                                           use_relative_links, dry_run)
                    # Links to F5 reads (if paired-end run)
                    if paired_end:
                        self.__linkLibraryData(library, expt_dir,
                                               naming_scheme,
                                               use_relative_links, dry_run,
                                               f5=True)
            # Make an empty ScriptCode directory
            scriptcode_dir = os.path.join(expt_dir, "ScriptCode")
            if os.path.exists(scriptcode_dir):
                logging.warning("Directory %s already exists" %
                                scriptcode_dir)
            else:
                self.__makeDir(scriptcode_dir, dry_run)
def buildAnalysisDirs(self,top_dir=None,dry_run=False,link_type="relative",
                      naming_scheme="partial"):
    """Construct and populate analysis directories for the experiments

    For each defined experiment, create the required analysis
    directories and populate with links to the primary data files.

    Arguments:
      top_dir: if set then create the analysis directories as
        subdirs of the specified directory; otherwise operate in cwd
      dry_run: if True then only report the mkdir, ln etc operations
        that would be performed. Default is False (do perform the
        operations).
      link_type: type of link to use when linking to primary data,
        one of 'relative' or 'absolute'. Any value other than
        'absolute' results in relative links.
      naming_scheme: naming scheme to use for links to primary data,
        one of 'full' (same names as primary data files), 'partial'
        (cut-down version of the full name which excludes sample
        names - the default), or 'minimal' (just the library name).
    """
    # Deal with top_dir
    if top_dir:
        if os.path.exists(top_dir):
            print("Directory %s already exists" % top_dir)
        elif dry_run:
            # Report what would have been done
            print("mkdir %s" % top_dir)
        else:
            # Create top directory
            print("Creating %s" % top_dir)
            bcf_utils.mkdir(top_dir,mode=0o775)
    # Type of link
    use_relative_links = (link_type != 'absolute')
    # For each experiment, make and populate directory
    for expt in self.experiments:
        print("Experiment: %s %s %s/%s" % (expt.name,expt.type,
                                           expt.sample,expt.library))
        expt_dir = expt.dirname(top_dir)
        print("\tDir: %s" % expt_dir)
        # Make directory
        if os.path.exists(expt_dir):
            logging.warning("Directory %s already exists" % expt_dir)
        elif dry_run:
            # Report what would have been done
            print("mkdir %s" % expt_dir)
        else:
            # Create directory
            bcf_utils.mkdir(expt_dir,mode=0o775)
        # Locate the primary data
        for run in self.solid_runs:
            # Each read set is (label, csfasta attribute, qual attribute,
            # extra keyword arguments for LinkNames.names); F3 is always
            # linked, F5 only for paired-end runs
            read_sets = [("F3","csfasta","qual",{})]
            if SolidData.is_paired_end(run):
                read_sets.append(("F5","csfasta_f5","qual_f5",{"F5":True}))
            libraries = run.fetchLibraries(expt.sample,expt.library)
            for library in libraries:
                for label,csfasta_attr,qual_attr,names_args in read_sets:
                    # Get names for links to this read set
                    ln_csfasta,ln_qual = \
                        LinkNames(naming_scheme).names(library,**names_args)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links; failures are logged, not raised, so
                    # the remaining data still gets processed
                    try:
                        self.__linkToFile(getattr(library,csfasta_attr),
                                          os.path.join(expt_dir,ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(getattr(library,qual_attr),
                                          os.path.join(expt_dir,ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error(
                            "Failed to link to some or all %s primary data"
                            % label)
                        logging.error("Exception: %s" % ex)