Python PipelineSettings Examples

Programming Language: Python

Namespace/Package Name: settings

Class/Type: PipelineSettings

Examples at hotexamples.com: 4

Python PipelineSettings - 4 examples found. These are the top-rated real-world Python examples of settings.PipelineSettings, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

PipelineSettings(1)

write_settings_cfg(1)

Example #1

Show file

File: project.py Project: kmhernan/ExScaliburSMD

    def initialize_project(self):
        '''Build the project state, logging each piece as it is loaded.

           Steps, in the order they run:

           1. Create the PipelineSettings object from the arguments
           2. Load the assembly via _set_assembly()
           3. Load the samples from the metadata file
           4. Set the directory structure via _set_project_files()
        '''
        log = self.logger.info
        log('Project Name: {0.name}'.format(self))

        # Settings: stored on the instance first, then reported
        settings = PipelineSettings(self.args, self.PATH)
        self.settings = settings
        log('Baseline Threads: {0.threads}'.format(settings))
        log('Clip Adapters: {0.clipping}'.format(settings))
        if settings.clipping:
            # The minimum read length is only reported when clipping is on
            log('Minimum read length: {0.min_length}'.format(settings))
        log('Minimum MAPQ: {0.min_map_q}'.format(settings))
        log('Aligners: {0}'.format(settings.aln_list))
        log('Somatic Mutation Detectors: {0}'.format(settings.smd_list))

        # Assembly info
        self._set_assembly()
        reference_name = self.assembly['refname']
        log('Reference Name: {0}'.format(reference_name))

        # Samples
        metadata_path = os.path.abspath(self.args.metadata)
        self.samples.load_samples(metadata_path, self.PATH, self.name)

        # File structure
        self._set_project_files()

Example #2

Show file

File: project.py Project: cribioinfo/ExScaliburSMD

    def initialize_project(self):
        """Populate settings, assembly, samples and project paths.

        The steps run in this order, each one logged through self.logger:

        1. Create the PipelineSettings object
        2. Load the assembly (_set_assembly)
        3. Load the samples from the metadata file
        4. Set the directory structure (_set_project_files)
        """
        info = self.logger.info
        info("Project Name: {0.name}".format(self))

        # Settings
        self.settings = PipelineSettings(self.args, self.PATH)
        cfg = self.settings
        info("Baseline Threads: {0.threads}".format(cfg))
        info("Clip Adapters: {0.clipping}".format(cfg))
        if cfg.clipping:
            # Only meaningful (and only logged) when adapters are clipped
            info("Minimum read length: {0.min_length}".format(cfg))
        info("Minimum MAPQ: {0.min_map_q}".format(cfg))
        info("Aligners: {0}".format(cfg.aln_list))
        info("Somatic Mutation Detectors: {0}".format(cfg.smd_list))

        # Assembly info
        self._set_assembly()
        refname = self.assembly["refname"]
        info("Reference Name: {0}".format(refname))

        # Samples
        metadata = os.path.abspath(self.args.metadata)
        self.samples.load_samples(metadata, self.PATH, self.name)

        # File structure
        self._set_project_files()

Example #3

Show file

File: project.py Project: kmhernan/ExScaliburSMD

class Project(object):
    '''Object representing a Tumor/Normal Exome Project

       -- args - the argparser object
       -- PATH - the path to the ExScaliburSMD application directory
       -- assembly - the assembly dictionary from the assembly yaml file
       -- settings - the PipelineSettings object
       -- files    - dictionary of main project paths
       -- samples  - the SampleFactory object
       -- name - the project ID
       -- final_reporter_yaml - path of the project reporter yaml file
       -- bds_file - path of the generated BDS config file
       -- config_list - sample config files handed to the run script
    '''

    # Reference entries whose values must be paths to readable files.
    _REFERENCE_FILE_KEYS = (
        'cosmic', 'exons_bed', 'knowndb', 'knownindel', 'referenceseq',
        'sequence_dictionary'
    )

    def __init__(self, args, PATH):
        '''Store the parsed arguments and start with empty project state.

           args - the argparser object (project_id is read here)
           PATH - the path to the ExScaliburSMD application directory
        '''
        self.logger = logging.getLogger('SMD.Project')
        self.args = args
        self.PATH = PATH
        self.assembly = None
        self.settings = None
        self.files = None
        self.name = args.project_id
        self.samples = SampleFactory()
        self.final_reporter_yaml = None
        self.bds_file = None

        # Lists holding the job submission commands to write to the
        # submission scripts
        self.preprocessing_jobs = []
        self.alignments_jobs = []
        self.gatk_jobs = []
        self.somatic_jobs = []
        self.annotation_jobs = []
        self.report_jobs = []
        self.config_list = []

    def initialize_project(self):
        '''Wrapper for building the project.

           1. Load the settings
           2. Load the assembly
           3. Load the samples
           4. Set the directory structure

           The settings are loaded first because _set_assembly() reads
           self.settings.aln_list.
        '''
        self.logger.info('Project Name: {0.name}'.format(self))

        # Settings
        self.settings = PipelineSettings(self.args, self.PATH)
        self.logger.info('Baseline Threads: {0.threads}'.format(self.settings))
        self.logger.info('Clip Adapters: {0.clipping}'.format(self.settings))
        if self.settings.clipping:
            self.logger.info('Minimum read length: {0.min_length}'.format(
                self.settings))
        self.logger.info('Minimum MAPQ: {0.min_map_q}'.format(self.settings))
        self.logger.info('Aligners: {0}'.format(self.settings.aln_list))
        self.logger.info('Somatic Mutation Detectors: {0}'.format(
            self.settings.smd_list))

        # Assembly info
        self._set_assembly()
        self.logger.info('Reference Name: {0}'.format(
            self.assembly['refname']))

        # Samples
        self.samples.load_samples(os.path.abspath(self.args.metadata),
                                  self.PATH, self.name)

        # File structure
        self._set_project_files()

    def configure_pipeline(self):
        '''Wrapper for creating the configuration files for each sample

           Uses self.settings, self.samples and self.files, which are
           populated by initialize_project().
        '''
        self.settings.write_settings_cfg(self)
        for s in self.samples:
            s.write_sample_bds(self, self.settings)
            s.write_sample_report_yaml()
            self.config_list.append(s.sample_cfg)
        self.__write_reporter_yaml()
        # The BDS config must be written before the run script, which
        # embeds self.bds_file in its command line.
        self.__write_bds_config()
        self.__write_project_run_script()

    def _set_assembly(self):
        '''Grabs the yaml file, validates it, and stores the dictionary

           The first entry of the 'reference' section of the config file is
           used. After validation it is stored in self.assembly with an
           added 'refname' key; nothing is returned.

           Raises:
             ValueError     - the 'reference' section is empty
             AssertionError - a required reference entry is missing
             IOError        - a required reference file cannot be opened
        '''
        # Load ('rU' was removed in Python 3.11; plain text mode suffices)
        with open(self.args.config_file, 'r') as in_fh:
            chk = yaml.safe_load(in_fh)['reference']

        if not chk:
            raise ValueError(
                "No reference defined in the 'reference' section of "
                '{0}'.format(self.args.config_file))

        # Check the file; dict views cannot be indexed on Python 3, so take
        # the first key through the iterator instead of keys()[0].
        refname = next(iter(chk))
        reference = chk[refname]

        required = list(self._REFERENCE_FILE_KEYS)
        if 'novoalign' in self.settings.aln_list:
            required.append('novoalign_index')

        for key in required:
            if key not in reference:
                # Raised explicitly instead of via `assert` so the check is
                # not stripped under `python -O`; the exception type stays
                # AssertionError for callers that already catch it.
                msg = "Required refrence section '%s' missing" % key
                self.logger.error(msg)
                raise AssertionError(msg)
            if key in self._REFERENCE_FILE_KEYS:
                # Only these entries are opened to prove they are readable;
                # novoalign_index is checked for presence only.
                try:
                    fp = open(reference[key])
                except IOError:
                    raise IOError('Unable to open {0} file: {1}'.format(
                        key, reference[key]))
                else:
                    fp.close()

        # Set refname and assign the dictionary to the member
        reference['refname'] = refname
        self.assembly = reference

    def _set_project_files(self):
        '''Sets the project, config, log, results, and report paths.

           project_path - parent directory for this project
           config_path - parent directory for all config files within this project
           log_path - parent directory for all log files within this project
           results_path - parent directory for all results files within this project
           report_path - parent directory for all report files within this project
        '''
        root = os.path.abspath(self.args.output_directory)
        self.files = {
            'project_path': root,
            'config_path': os.path.join(root, 'config'),
            'log_path': os.path.join(root, 'logs'),
            'results_path': os.path.join(root, 'results'),
            'report_path': os.path.join(root, 'report'),
        }
        # NOTE(review): check_dir is defined elsewhere; presumably it creates
        # the directory when missing - confirm. The project root is checked
        # first so the order does not depend on dict ordering.
        for key in ('project_path', 'config_path', 'log_path',
                    'results_path', 'report_path'):
            check_dir(self.files[key])

    def __write_bds_config(self):
        '''Write the default BDS config file for an amazon cloud run

           The file is written to <output_directory>/bds.config and its
           path is stored in self.bds_file.
        '''
        self.bds_file = os.path.join(
            os.path.abspath(self.args.output_directory), 'bds.config')
        config_text = (
            '# BDS configuration file\n'
            '# This is set up to work easily with STAR cluster on EC2\n'
            '# Please see http://pcingola.github.io/BigDataScript/bigDataScript_manual.html#config\n'
            '# for more information.\n\n'
            '# Easy way to change mem usage for all jobs\n'
            '#mem = -1\n\n'
            '# Easy way to declare a particular node for all jobs\n'
            '#node = my_node\n\n'
            '# Easy way to declare a particular queue for all jobs\n'
            '#queue = my_queue\n\n'
            '# Easy way to declare the timeout for all jobs (seconds). It is 24 hours by default\n'
            '#timeout = 86400\n\n'
            '# Number of times a failed job can be retried. We find 0 is the safest.\n'
            'retry = 0\n\n'
            '# Sometimes many qsub commands is not able to be handled on some systems.\n'
            '# Use this to set the number of milliseconds to wait after qsub.\n'
            'waitAfterTaskRun = 200\n\n'
            '# Use bash shell for tasks\n'
            'taskShell = /bin/bash -e\n\n'
            '# SGE parallel environment\n'
            'sge.pe = smp\n\n'
            '# SGE mem argument. We set mem_free as consumable, jobs will wait, but it does not\n'
            '# guarantee that memory will be available (e.g., it will not kill jobs that use more\n'
            '# memory than requested. You can change memory usage for each tool in the ExScalibur configs\n'
            'sge.mem = mem_free\n\n'
            '# SGE timeout variable\n'
            'sge.timeout = h_rt\n\n'
        )
        # Text mode: the content is str, which a binary-mode file rejects
        # on Python 3.
        with open(self.bds_file, 'w') as o:
            o.write(config_text)

    def __write_project_run_script(self):
        '''The script for running ExScalibur

           Writes <project_path>/<name>.exscalibur-smd.sh and marks it
           executable for the owner. Reads self.bds_file, self.config_list
           and self.settings, so it must run after they are populated.
        '''
        bds_script = os.path.join(self.PATH, 'ExScaliburSMD-run.bds')
        main_job = os.path.join(self.files['project_path'],
                                '{0.name}.exscalibur-smd.sh'.format(self))
        bds_log = os.path.join(self.files['project_path'], 'BDS-System.logs')
        cri_log = os.path.join(self.files['project_path'], 'CRI-Info.logs')

        # stdout of bds goes to cri_log, stderr to bds_log
        command = (
            'bds -c ' + self.bds_file +
            ' -reportHtml -reportYaml -v -log -s sge ' +
            '{0} -in {1} -options {2} > {3} 2> {4}\n'.format(
                bds_script, ' '.join(self.config_list),
                self.settings.settings_cfg, cri_log, bds_log))

        # Create the job script (text mode: str content, see
        # __write_bds_config)
        with open(main_job, 'w') as o:
            o.write('#!/bin/bash\n')
            o.write('\n')
            o.write('cd {0}\n\n'.format(self.files['project_path']))

            if self.settings.system['module_source']:
                o.write('. {0}\n'.format(
                    self.settings.system['module_source']))
            if self.settings.system['java']['use_module']:
                o.write('module load {0}\n\n'.format(
                    self.settings.system['java']['module']['name']))
            o.write('# Command for running the ExScalibur-SMD pipeline\n\n')
            o.write(command)

        # Make user executable
        import stat
        st = os.stat(main_job)
        os.chmod(main_job, st.st_mode | stat.S_IEXEC)

    ###################################
    ## Reporter functions
    def __write_reporter_yaml(self):
        '''Writes the pipeline reporter yaml file

           One entry per sample is collected from its reporter_obj and the
           result is dumped to <report_path>/<name>.exscalibur.smd.yaml;
           the path is stored in self.final_reporter_yaml.
        '''
        self.final_reporter_yaml = os.path.join(
            self.files['report_path'],
            '{0.name}.exscalibur.smd.yaml'.format(self))
        dic = {'data': [], 'project': self.name}
        for s in self.samples:
            curr = s.reporter_obj
            dic['data'].append({
                'fastqc': self.__reporter_fastqc(curr),
                'alignments': self.__reporter_alignments(curr),
                'somatic': self.__reporter_somatic(curr),
                'sample': s.name
            })
        # Text mode so the dump works on Python 3, where the emitter
        # writes str unless an encoding is requested.
        with open(self.final_reporter_yaml, 'w') as o:
            yaml.safe_dump(dic, o)

    def __relative_to_report(self, path):
        '''Returns path expressed relative to the report directory'''
        return os.path.relpath(path, start=self.files['report_path'])

    def __fastqc_readgroups(self, readgroups):
        '''Creates the fastqc entries for one list of readgroup dicts

           rightseq is None for readgroups that are not paired.
        '''
        return [{
            'readgroup': rg['readgroup'],
            'leftseq': self.__relative_to_report(rg['leftseq']),
            'rightseq': (self.__relative_to_report(rg['rightseq'])
                         if rg['paired'] else None),
            'paired': rg['paired'],
        } for rg in readgroups]

    def __reporter_fastqc(self, dic):
        '''Creates the relative paths for fastqc files

           dic - a sample reporter dictionary; its
                 ['reads']['outputs']['normal'] and ['tumor'] lists are read
        '''
        outputs = dic['reads']['outputs']
        curr_normal = self.__fastqc_readgroups(outputs['normal'])
        curr_tumor = self.__fastqc_readgroups(outputs['tumor'])
        return {'inputs': {'normal': curr_normal, 'tumor': curr_tumor}}

    def __reporter_alignments(self, dic):
        '''Creates the relative paths for the alignment files

           dic - a sample reporter dictionary; its
                 ['alignments']['outputs'] entry is read
        '''
        curr = dic['alignments']['outputs']
        return {
            'inputs': {
                'alignment_summary_metrics':
                self.__relative_to_report(curr['alignment_summary_metrics']),
                'insert_size_metrics':
                self.__relative_to_report(curr['insert_size_metrics']),
                'total_coverage':
                self.__relative_to_report(curr['total_coverage']),
            }
        }

    def __reporter_somatic(self, dic):
        '''Creates the relative paths for the somatic files

           dic - a sample reporter dictionary; its
                 ['somatic']['outputs']['smd_snp_table'] entry is read
        '''
        return {
            'inputs': {
                'somatic_table':
                self.__relative_to_report(
                    dic['somatic']['outputs']['smd_snp_table'])
            }
        }

Example #4

Show file

File: project.py Project: cribioinfo/ExScaliburSMD

class Project(object):
    """Object representing a Tumor/Normal Exome Project

       -- args - the argparser object
       -- PATH - the path to the ExScaliburSMD application directory
       -- assembly - the assembly dictionary from the assembly yaml file
       -- settings - the PipelineSettings object
       -- files    - dictionary of main project paths
       -- samples  - the SampleFactory object
       -- name - the project ID
       -- final_reporter_yaml - path of the project reporter yaml file
       -- bds_file - path of the generated BDS config file
       -- config_list - sample config files handed to the run script
    """

    # Reference entries whose values must be paths to readable files.
    _REFERENCE_FILE_KEYS = (
        "cosmic",
        "exons_bed",
        "knowndb",
        "knownindel",
        "referenceseq",
        "sequence_dictionary",
    )

    def __init__(self, args, PATH):
        """Store the parsed arguments and start with empty project state.

           args - the argparser object (project_id is read here)
           PATH - the path to the ExScaliburSMD application directory
        """
        self.logger = logging.getLogger("SMD.Project")
        self.args = args
        self.PATH = PATH
        self.assembly = None
        self.settings = None
        self.files = None
        self.name = args.project_id
        self.samples = SampleFactory()
        self.final_reporter_yaml = None
        self.bds_file = None

        # Lists holding the job submission commands to write to the
        # submission scripts
        self.preprocessing_jobs = []
        self.alignments_jobs = []
        self.gatk_jobs = []
        self.somatic_jobs = []
        self.annotation_jobs = []
        self.report_jobs = []
        self.config_list = []

    def initialize_project(self):
        """Wrapper for building the project.

           1. Load the settings
           2. Load the assembly
           3. Load the samples
           4. Set the directory structure

           The settings are loaded first because _set_assembly() reads
           self.settings.aln_list.
        """
        self.logger.info("Project Name: {0.name}".format(self))

        # Settings
        self.settings = PipelineSettings(self.args, self.PATH)
        self.logger.info("Baseline Threads: {0.threads}".format(self.settings))
        self.logger.info("Clip Adapters: {0.clipping}".format(self.settings))
        if self.settings.clipping:
            self.logger.info("Minimum read length: {0.min_length}".format(self.settings))
        self.logger.info("Minimum MAPQ: {0.min_map_q}".format(self.settings))
        self.logger.info("Aligners: {0}".format(self.settings.aln_list))
        self.logger.info("Somatic Mutation Detectors: {0}".format(self.settings.smd_list))

        # Assembly info
        self._set_assembly()
        self.logger.info("Reference Name: {0}".format(self.assembly["refname"]))

        # Samples
        self.samples.load_samples(os.path.abspath(self.args.metadata), self.PATH, self.name)

        # File structure
        self._set_project_files()

    def configure_pipeline(self):
        """Wrapper for creating the configuration files for each sample

           Uses self.settings, self.samples and self.files, which are
           populated by initialize_project().
        """
        self.settings.write_settings_cfg(self)
        for s in self.samples:
            s.write_sample_bds(self, self.settings)
            s.write_sample_report_yaml()
            self.config_list.append(s.sample_cfg)
        self.__write_reporter_yaml()
        # The BDS config must be written before the run script, which
        # embeds self.bds_file in its command line.
        self.__write_bds_config()
        self.__write_project_run_script()

    def _set_assembly(self):
        """Grabs the yaml file, validates it, and stores the dictionary

           The first entry of the "reference" section of the config file is
           used. After validation it is stored in self.assembly with an
           added "refname" key; nothing is returned.

           Raises:
             ValueError     - the "reference" section is empty
             AssertionError - a required reference entry is missing
             IOError        - a required reference file cannot be opened
        """
        # Load ("rU" was removed in Python 3.11; plain text mode suffices)
        with open(self.args.config_file, "r") as in_fh:
            chk = yaml.safe_load(in_fh)["reference"]

        if not chk:
            raise ValueError(
                "No reference defined in the 'reference' section of {0}".format(self.args.config_file)
            )

        # Check the file; dict views cannot be indexed on Python 3, so take
        # the first key through the iterator instead of keys()[0].
        refname = next(iter(chk))
        reference = chk[refname]

        required = list(self._REFERENCE_FILE_KEYS)
        if "novoalign" in self.settings.aln_list:
            required.append("novoalign_index")

        for key in required:
            if key not in reference:
                # Raised explicitly instead of via `assert` so the check is
                # not stripped under `python -O`; the exception type stays
                # AssertionError for callers that already catch it.
                msg = "Required refrence section '%s' missing" % key
                self.logger.error(msg)
                raise AssertionError(msg)
            if key in self._REFERENCE_FILE_KEYS:
                # Only these entries are opened to prove they are readable;
                # novoalign_index is checked for presence only.
                try:
                    fp = open(reference[key])
                except IOError:
                    raise IOError("Unable to open {0} file: {1}".format(key, reference[key]))
                else:
                    fp.close()

        # Set refname and assign the dictionary to the member
        reference["refname"] = refname
        self.assembly = reference

    def _set_project_files(self):
        """Sets the project, config, log, results, and report paths.

           project_path - parent directory for this project
           config_path - parent directory for all config files within this project
           log_path - parent directory for all log files within this project
           results_path - parent directory for all results files within this project
           report_path - parent directory for all report files within this project
        """
        root = os.path.abspath(self.args.output_directory)
        self.files = {
            "project_path": root,
            "config_path": os.path.join(root, "config"),
            "log_path": os.path.join(root, "logs"),
            "results_path": os.path.join(root, "results"),
            "report_path": os.path.join(root, "report"),
        }
        # NOTE(review): check_dir is defined elsewhere; presumably it creates
        # the directory when missing - confirm. The project root is checked
        # first so the order does not depend on dict ordering.
        for key in ("project_path", "config_path", "log_path", "results_path", "report_path"):
            check_dir(self.files[key])

    def __write_bds_config(self):
        """Write the default BDS config file for an amazon cloud run

           The file is written to <output_directory>/bds.config and its
           path is stored in self.bds_file.
        """
        self.bds_file = os.path.join(os.path.abspath(self.args.output_directory), "bds.config")
        config_text = (
            "# BDS configuration file\n"
            "# This is set up to work easily with STAR cluster on EC2\n"
            "# Please see http://pcingola.github.io/BigDataScript/bigDataScript_manual.html#config\n"
            "# for more information.\n\n"
            "# Easy way to change mem usage for all jobs\n"
            "#mem = -1\n\n"
            "# Easy way to declare a particular node for all jobs\n"
            "#node = my_node\n\n"
            "# Easy way to declare a particular queue for all jobs\n"
            "#queue = my_queue\n\n"
            "# Easy way to declare the timeout for all jobs (seconds). It is 24 hours by default\n"
            "#timeout = 86400\n\n"
            "# Number of times a failed job can be retried. We find 0 is the safest.\n"
            "retry = 0\n\n"
            "# Sometimes many qsub commands is not able to be handled on some systems.\n"
            "# Use this to set the number of milliseconds to wait after qsub.\n"
            "waitAfterTaskRun = 200\n\n"
            "# Use bash shell for tasks\n"
            "taskShell = /bin/bash -e\n\n"
            "# SGE parallel environment\n"
            "sge.pe = smp\n\n"
            "# SGE mem argument. We set mem_free as consumable, jobs will wait, but it does not\n"
            "# guarantee that memory will be available (e.g., it will not kill jobs that use more\n"
            "# memory than requested. You can change memory usage for each tool in the ExScalibur configs\n"
            "sge.mem = mem_free\n\n"
            "# SGE timeout variable\n"
            "sge.timeout = h_rt\n\n"
        )
        # Text mode: the content is str, which a binary-mode file rejects
        # on Python 3.
        with open(self.bds_file, "w") as o:
            o.write(config_text)

    def __write_project_run_script(self):
        """The script for running ExScalibur

           Writes <project_path>/<name>.exscalibur-smd.sh and marks it
           executable for the owner. Reads self.bds_file, self.config_list
           and self.settings, so it must run after they are populated.
        """
        bds_script = os.path.join(self.PATH, "ExScaliburSMD-run.bds")
        main_job = os.path.join(self.files["project_path"], "{0.name}.exscalibur-smd.sh".format(self))
        bds_log = os.path.join(self.files["project_path"], "BDS-System.logs")
        cri_log = os.path.join(self.files["project_path"], "CRI-Info.logs")

        # stdout of bds goes to cri_log, stderr to bds_log
        command = (
            "bds -c "
            + self.bds_file
            + " -reportHtml -reportYaml -v -log -s sge "
            + "{0} -in {1} -options {2} > {3} 2> {4}\n".format(
                bds_script, " ".join(self.config_list), self.settings.settings_cfg, cri_log, bds_log
            )
        )

        # Create the job script (text mode: str content, see
        # __write_bds_config)
        with open(main_job, "w") as o:
            o.write("#!/bin/bash\n")
            o.write("\n")
            o.write("cd {0}\n\n".format(self.files["project_path"]))

            if self.settings.system["module_source"]:
                o.write(". {0}\n".format(self.settings.system["module_source"]))
            if self.settings.system["java"]["use_module"]:
                o.write("module load {0}\n\n".format(self.settings.system["java"]["module"]["name"]))
            o.write("# Command for running the ExScalibur-SMD pipeline\n\n")
            o.write(command)

        # Make user executable
        import stat

        st = os.stat(main_job)
        os.chmod(main_job, st.st_mode | stat.S_IEXEC)

    ###################################
    ## Reporter functions
    def __write_reporter_yaml(self):
        """Writes the pipeline reporter yaml file

           One entry per sample is collected from its reporter_obj and the
           result is dumped to <report_path>/<name>.exscalibur.smd.yaml;
           the path is stored in self.final_reporter_yaml.
        """
        self.final_reporter_yaml = os.path.join(self.files["report_path"], "{0.name}.exscalibur.smd.yaml".format(self))
        dic = {"data": [], "project": self.name}
        for s in self.samples:
            curr = s.reporter_obj
            dic["data"].append(
                {
                    "fastqc": self.__reporter_fastqc(curr),
                    "alignments": self.__reporter_alignments(curr),
                    "somatic": self.__reporter_somatic(curr),
                    "sample": s.name,
                }
            )
        # Text mode so the dump works on Python 3, where the emitter
        # writes str unless an encoding is requested.
        with open(self.final_reporter_yaml, "w") as o:
            yaml.safe_dump(dic, o)

    def __relative_to_report(self, path):
        """Returns path expressed relative to the report directory"""
        return os.path.relpath(path, start=self.files["report_path"])

    def __fastqc_readgroups(self, readgroups):
        """Creates the fastqc entries for one list of readgroup dicts

           rightseq is None for readgroups that are not paired.
        """
        return [
            {
                "readgroup": rg["readgroup"],
                "leftseq": self.__relative_to_report(rg["leftseq"]),
                "rightseq": self.__relative_to_report(rg["rightseq"]) if rg["paired"] else None,
                "paired": rg["paired"],
            }
            for rg in readgroups
        ]

    def __reporter_fastqc(self, dic):
        """Creates the relative paths for fastqc files

           dic - a sample reporter dictionary; its
                 ["reads"]["outputs"]["normal"] and ["tumor"] lists are read
        """
        outputs = dic["reads"]["outputs"]
        curr_normal = self.__fastqc_readgroups(outputs["normal"])
        curr_tumor = self.__fastqc_readgroups(outputs["tumor"])
        return {"inputs": {"normal": curr_normal, "tumor": curr_tumor}}

    def __reporter_alignments(self, dic):
        """Creates the relative paths for the alignment files

           dic - a sample reporter dictionary; its
                 ["alignments"]["outputs"] entry is read
        """
        curr = dic["alignments"]["outputs"]
        return {
            "inputs": {
                "alignment_summary_metrics": self.__relative_to_report(curr["alignment_summary_metrics"]),
                "insert_size_metrics": self.__relative_to_report(curr["insert_size_metrics"]),
                "total_coverage": self.__relative_to_report(curr["total_coverage"]),
            }
        }

    def __reporter_somatic(self, dic):
        """Creates the relative paths for the somatic files

           dic - a sample reporter dictionary; its
                 ["somatic"]["outputs"]["smd_snp_table"] entry is read
        """
        return {
            "inputs": {
                "somatic_table": self.__relative_to_report(dic["somatic"]["outputs"]["smd_snp_table"])
            }
        }