Beispiel #1
0
def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates it either with links to the
    FASTQ files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    The directory name is taken from 'project.full_name'. An empty
    'ScriptCode' subdirectory is also created inside it.

    Directories that already exist are reported and reused. A link
    whose target name already exists is logged as an error and skipped.

    Arguments:
      project   : populated IlluminaProject object
      top_dir   : parent directory to create analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                  each sample/barcode/read number combination by
                  concatenating the matching FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use the
                  names from IlluminaData.get_unique_fastq_names
      dry_run   : if True then report what would be done but don't
                  create any directories, links or merged files

    Returns:
      Name of the project directory.

    """
    # Honour the documented default: os.path.join cannot handle None
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                link_name = fastq if keep_names else fastq_names[fastq]
                fastq_ln = os.path.join(project_dir, link_name)
                if os.path.exists(fastq_ln):
                    # Never overwrite something that is already there
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                    continue
                print("Linking to %s" % fastq)
                if not dry_run:
                    bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            replicates = {}
            # Gather replicates to be merged, keyed on the merged name
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort each group once, after everything has been gathered
            for name in replicates:
                replicates[name].sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for a dry run, which must not write)
            for name in replicates:
                merged_fastq = os.path.join(project_dir, name + '.fastq')
                if not dry_run:
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        project_name = project.name
        if project.expt_type is not None:
            project_name += "_%s" % project.expt_type
        project_dir = os.path.join(illumina_analysis_dir,project_name)
        print "Creating analysis directory for project '%s'..." % project_name
        # Check for & create directory
        if os.path.exists(project_dir):
            print "-> %s already exists" % project_dir
        else:
            print "Making analysis directory for %s" % project.name
            if not options.dry_run:
                bcf_utils.mkdir(project_dir,mode=0775)
        # Check for & create links to fastq files
        if not options.merge_replicates:
            for sample in project.samples:
                fastq_names = get_unique_fastqs(sample)
                for fastq in sample.fastq:
                    fastq_file = os.path.join(sample.dirn,fastq)
                    if options.keep_names:
                        fastq_ln = os.path.join(project_dir,fastq)
                    else:
                        fastq_ln = os.path.join(project_dir,fastq_names[fastq])
                    if os.path.exists(fastq_ln):
                        logging.error("Failed to link to %s: %s already exists" %
                                      (fastq_file,os.path.basename(fastq_ln)))
                    else:
                        print "Linking to %s" % fastq
Beispiel #3
0
    def buildAnalysisDirs(self,
                          top_dir=None,
                          dry_run=False,
                          link_type="relative",
                          naming_scheme="partial"):
        """Construct and populate analysis directories for the experiments

        For each defined experiment, create the required analysis directory
        and populate it with links to the primary data files (csfasta and
        qual) of every matching library in every known SOLiD run. For
        paired-end runs the F5 read files are linked as well.

        Failures while making links are logged and do not stop processing
        of the remaining libraries and experiments.

        Arguments:
          top_dir: if set then create the analysis directories as
            subdirs of the specified directory (which is created if it
            doesn't exist); otherwise the location is whatever
            Experiment.dirname returns for an unset top_dir
          dry_run: if True then only report the mkdir, ln etc operations that
            would be performed. Default is False (do perform the operations).
          link_type: type of link to use when linking to primary data, one of
            'relative' or 'absolute'. Any value other than 'absolute' gives
            relative links.
          naming_scheme: naming scheme to use for links to primary data,
            passed to LinkNames: one of 'full' (same names as primary data
            files), 'partial' (cut-down version of the full name which
            excludes sample names - the default), or 'minimal' (just the
            library name).
        """
        # Deal with top_dir
        if top_dir:
            if os.path.exists(top_dir):
                print("Directory %s already exists" % top_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % top_dir)
            else:
                # Create top directory
                print("Creating %s" % top_dir)
                bcf_utils.mkdir(top_dir, mode=0o775)
        # Type of link: everything except 'absolute' means relative
        use_relative_links = (link_type != 'absolute')
        # For each experiment, make and populate directory
        for expt in self.experiments:
            print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                               expt.sample, expt.library))
            expt_dir = expt.dirname(top_dir)
            print("\tDir: %s" % expt_dir)
            # Make directory (an existing one is reused after a warning)
            if os.path.exists(expt_dir):
                logging.warning("Directory %s already exists", expt_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % expt_dir)
            else:
                # Create directory
                bcf_utils.mkdir(expt_dir, mode=0o775)
            # Locate the primary data
            for run in self.solid_runs:
                paired_end = SolidData.is_paired_end(run)
                libraries = run.fetchLibraries(expt.sample, expt.library)
                for library in libraries:
                    # Get names for links to primary data - F3
                    ln_csfasta, ln_qual = LinkNames(naming_scheme).names(
                        library)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to primary data; best effort, so a
                    # failure is logged rather than propagated
                    try:
                        self.__linkToFile(library.csfasta,
                                          os.path.join(expt_dir, ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual,
                                          os.path.join(expt_dir, ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error(
                            "Failed to link to some or all F3 primary data")
                        logging.error("Exception: %s", ex)
                    # Nothing more to do unless this is a paired-end run
                    if not paired_end:
                        continue
                    # Get names for links to F5 reads
                    ln_csfasta, ln_qual = LinkNames(naming_scheme).names(
                        library, F5=True)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to F5 read data
                    try:
                        self.__linkToFile(library.csfasta_f5,
                                          os.path.join(expt_dir, ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual_f5,
                                          os.path.join(expt_dir, ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error(
                            "Failed to link to some or all F5 primary data")
                        logging.error("Exception: %s", ex)
Beispiel #4
0
class ExperimentList:
    """Container for a collection of Experiments

    Experiments are created and added to the ExperimentList by calling
    the addExperiment method, which returns a new Experiment object
    (or addDuplicateExperiment, which adds a copy of an existing one).

    The calling subprogram then populates the Experiment properties as
    appropriate.

    Once all Experiments are defined the analysis directory can be
    constructed by calling the buildAnalysisDirs method, which creates
    directories and links to primary data according to the definition
    of each experiment.
    """
    def __init__(self, solid_run_dir=None):
        """Create a new ExperimentList instance.

        Arguments:
          solid_run_dir: (optional) the path of the source SOLiD run
            directory. If supplied then run data is loaded immediately
            for this directory and for the same path with '_2' appended.
        """
        self.experiments = []
        self.solid_run_dir = solid_run_dir
        self.solid_runs = []
        self.__getSolidRunData()

    def __getSolidRunData(self):
        """Get data about SOLiD runs

        Internal function to construct SolidRun objects based on the
        supplied SOLiD run directory and its '_2' counterpart. Runs
        which evaluate as false are not stored; a warning is logged if
        no runs are stored at all.
        """
        if self.solid_run_dir is None:
            # No run directory supplied, nothing to look up
            return
        logging.debug("Acquiring run information")
        for solid_dir in (self.solid_run_dir, self.solid_run_dir + "_2"):
            logging.debug("Examining %s", solid_dir)
            run = SolidData.SolidRun(solid_dir)
            if run:
                self.solid_runs.append(run)
            else:
                logging.debug("Unable to get run data for %s", solid_dir)
        if not self.solid_runs:
            logging.warning("No run data found")

    def addExperiment(self, name):
        """Create a new Experiment and add to the list

        Arguments:
          name: the name of the new experiment

        Returns:
          New Experiment object with name already set
        """
        new_expt = Experiment()
        new_expt.name = name
        self.experiments.append(new_expt)
        return new_expt

    def addDuplicateExperiment(self, expt):
        """Duplicate an existing Experiment and add to the list

        Arguments:
          expt: an existing Experiment object

        Returns:
          New Experiment object produced by calling copy() on the input
        """
        new_expt = expt.copy()
        self.experiments.append(new_expt)
        return new_expt

    def getLastExperiment(self):
        """Return the last Experiment added to the list

        Returns:
          The most recently added Experiment, or None if the list is
          empty.
        """
        if not self.experiments:
            return None
        return self.experiments[-1]

    def buildAnalysisDirs(self,
                          top_dir=None,
                          dry_run=False,
                          link_type="relative",
                          naming_scheme="partial"):
        """Construct and populate analysis directories for the experiments

        For each defined experiment, create the required analysis directory,
        populate it with links to the primary data files (csfasta and qual)
        of every matching library in every known SOLiD run, and add an
        empty 'ScriptCode' subdirectory. For paired-end runs the F5 read
        files are linked as well.

        Failures while making links are logged and do not stop processing
        of the remaining libraries and experiments.

        Arguments:
          top_dir: if set then create the analysis directories as
            subdirs of the specified directory (which is created if it
            doesn't exist); otherwise the location is whatever
            Experiment.dirname returns for an unset top_dir
          dry_run: if True then only report the mkdir, ln etc operations that
            would be performed. Default is False (do perform the operations).
          link_type: type of link to use when linking to primary data, one of
            'relative' or 'absolute'. Any value other than 'absolute' gives
            relative links.
          naming_scheme: naming scheme to use for links to primary data,
            passed to LinkNames: one of 'full' (same names as primary data
            files), 'partial' (cut-down version of the full name which
            excludes sample names - the default), or 'minimal' (just the
            library name).
        """
        # Deal with top_dir
        if top_dir:
            if os.path.exists(top_dir):
                print("Directory %s already exists" % top_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % top_dir)
            else:
                # Create top directory
                print("Creating %s" % top_dir)
                bcf_utils.mkdir(top_dir, mode=0o775)
        # Type of link: everything except 'absolute' means relative
        use_relative_links = (link_type != 'absolute')
        # For each experiment, make and populate directory
        for expt in self.experiments:
            print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                               expt.sample, expt.library))
            expt_dir = expt.dirname(top_dir)
            print("\tDir: %s" % expt_dir)
            # Make directory (an existing one is reused after a warning)
            if os.path.exists(expt_dir):
                logging.warning("Directory %s already exists", expt_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % expt_dir)
            else:
                # Create directory
                bcf_utils.mkdir(expt_dir, mode=0o775)
            # Locate the primary data
            for run in self.solid_runs:
                paired_end = SolidData.is_paired_end(run)
                libraries = run.fetchLibraries(expt.sample, expt.library)
                for library in libraries:
                    # Get names for links to primary data - F3
                    ln_csfasta, ln_qual = LinkNames(naming_scheme).names(
                        library)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to primary data; best effort, so a
                    # failure is logged rather than propagated
                    try:
                        self.__linkToFile(library.csfasta,
                                          os.path.join(expt_dir, ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual,
                                          os.path.join(expt_dir, ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error(
                            "Failed to link to some or all F3 primary data")
                        logging.error("Exception: %s", ex)
                    # Nothing more to do unless this is a paired-end run
                    if not paired_end:
                        continue
                    # Get names for links to F5 reads
                    ln_csfasta, ln_qual = LinkNames(naming_scheme).names(
                        library, F5=True)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to F5 read data
                    try:
                        self.__linkToFile(library.csfasta_f5,
                                          os.path.join(expt_dir, ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual_f5,
                                          os.path.join(expt_dir, ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error(
                            "Failed to link to some or all F5 primary data")
                        logging.error("Exception: %s", ex)
            # Make an empty ScriptCode directory
            scriptcode_dir = os.path.join(expt_dir, "ScriptCode")
            if os.path.exists(scriptcode_dir):
                logging.warning("Directory %s already exists", scriptcode_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % scriptcode_dir)
            else:
                # Create directory
                bcf_utils.mkdir(scriptcode_dir, mode=0o775)
Beispiel #5
0
    def buildAnalysisDirs(self,top_dir=None,dry_run=False,link_type="relative",
                          naming_scheme="partial"):
        """Construct and populate analysis directories for the experiments

        For each defined experiment, create the required analysis directory
        and populate it with links to the primary data files (csfasta and
        qual) of every matching library in every known SOLiD run. For
        paired-end runs the F5 read files are linked as well.

        Failures while making links are logged and do not stop processing
        of the remaining libraries and experiments.

        Arguments:
          top_dir: if set then create the analysis directories as
            subdirs of the specified directory (which is created if it
            doesn't exist); otherwise the location is whatever
            Experiment.dirname returns for an unset top_dir
          dry_run: if True then only report the mkdir, ln etc operations that
            would be performed. Default is False (do perform the operations).
          link_type: type of link to use when linking to primary data, one of
            'relative' or 'absolute'. Any value other than 'absolute' gives
            relative links.
          naming_scheme: naming scheme to use for links to primary data,
            passed to LinkNames: one of 'full' (same names as primary data
            files), 'partial' (cut-down version of the full name which
            excludes sample names - the default), or 'minimal' (just the
            library name).
        """
        # Deal with top_dir
        if top_dir:
            if os.path.exists(top_dir):
                print("Directory %s already exists" % top_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % top_dir)
            else:
                # Create top directory
                print("Creating %s" % top_dir)
                bcf_utils.mkdir(top_dir,mode=0o775)
        # Type of link: everything except 'absolute' means relative
        use_relative_links = (link_type != 'absolute')
        # For each experiment, make and populate directory
        for expt in self.experiments:
            print("Experiment: %s %s %s/%s" % (expt.name,expt.type,
                                               expt.sample,expt.library))
            expt_dir = expt.dirname(top_dir)
            print("\tDir: %s" % expt_dir)
            # Make directory (an existing one is reused after a warning)
            if os.path.exists(expt_dir):
                logging.warning("Directory %s already exists",expt_dir)
            elif dry_run:
                # Report what would have been done
                print("mkdir %s" % expt_dir)
            else:
                # Create directory
                bcf_utils.mkdir(expt_dir,mode=0o775)
            # Locate the primary data
            for run in self.solid_runs:
                paired_end = SolidData.is_paired_end(run)
                libraries = run.fetchLibraries(expt.sample,expt.library)
                for library in libraries:
                    # Get names for links to primary data - F3
                    ln_csfasta,ln_qual = LinkNames(naming_scheme).names(library)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to primary data; best effort, so a
                    # failure is logged rather than propagated
                    try:
                        self.__linkToFile(library.csfasta,
                                          os.path.join(expt_dir,ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual,
                                          os.path.join(expt_dir,ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error("Failed to link to some or all F3 primary data")
                        logging.error("Exception: %s",ex)
                    # Nothing more to do unless this is a paired-end run
                    if not paired_end:
                        continue
                    # Get names for links to F5 reads
                    ln_csfasta,ln_qual = LinkNames(naming_scheme).names(library,
                                                                        F5=True)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # Make links to F5 read data
                    try:
                        self.__linkToFile(library.csfasta_f5,
                                          os.path.join(expt_dir,ln_csfasta),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                        self.__linkToFile(library.qual_f5,
                                          os.path.join(expt_dir,ln_qual),
                                          relative=use_relative_links,
                                          dry_run=dry_run)
                    except Exception as ex:
                        logging.error("Failed to link to some or all F5 primary data")
                        logging.error("Exception: %s",ex)