def pipeline(dataset):
    """
    Decide which pipeline to run on a dataset, and run luigi.build.
    :param analysis_driver.dataset.Dataset dataset:
    """
    luigi.interface.setup_interface_logging.has_run = True  # turn off Luigi's default logging setup
    log_cfg.get_logger('luigi-interface', 20)  # just calling log_cfg.get_logger registers the luigi-interface logger

    dataset.resolve_pipeline_and_toolset()
    dataset.start()
    final_stage = dataset.pipeline.build_pipeline(dataset)

    luigi_params = {
        'tasks': [final_stage],
        'local_scheduler': cfg.query('luigi', 'local_scheduler'),
        'workers': cfg.query('luigi', 'max_parallel_jobs', ret_default=4)
    }
    if luigi_params['local_scheduler'] is not True:
        luigi_params['scheduler_url'] = cfg['luigi']['scheduler_url']

    success = luigi.build(**luigi_params)

    # if any exceptions occurred during the pipeline, raise them again here
    dataset.raise_exceptions()

    return 0 if success is True else 9
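# Hedged usage sketch (not from the source): pipeline() reads the 'luigi' section
# of cfg queried above, so a minimal config fragment might look like
#
#     {'luigi': {'local_scheduler': False,
#                'scheduler_url': 'http://localhost:8082',
#                'max_parallel_jobs': 4}}
#
# Given an analysis_driver.dataset.Dataset instance, the caller would run
# `exit_status = pipeline(dataset)` and treat 0 as success and 9 as failure.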
    def __init__(self, dataset, window_size=60, tile_quality_threshold=None, cycle_quality_threshold=None):
        self.dataset = dataset
        self.run_dir = dataset.input_dir
        self.tile_ids = dataset.run_info.tiles
        self.ncycles = sum(Reads.num_cycles(r) for r in dataset.run_info.reads.reads)
        self.window_size = window_size
        self.all_lanes = None
        self.tile_quality_threshold = tile_quality_threshold or cfg.query(
            'fastq_filterer', 'tile_quality_threshold', ret_default=20
        )
        self.cycle_quality_threshold = cycle_quality_threshold or cfg.query(
            'fastq_filterer', 'cycle_quality_threshold', ret_default=18
        )

        self.read_interop_metrics()
Example #3
def get_genome_version(sample_id, species=None):
    s = get_sample(sample_id)
    if not s:
        return None
    genome_version = s.udf.get('Genome Version', None)
    if not genome_version and species:
        return cfg.query('species', species, 'default')
    return genome_version
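# Hedged illustration (config layout inferred from the cfg.query call above, values
# hypothetical): the species fallback resolves cfg['species'][<species>]['default'],
# so with
#
#     {'species': {'Homo sapiens': {'default': 'hg38'}}}
#
# get_genome_version('a_sample_id', species='Homo sapiens') returns 'hg38' whenever
# the sample exists but has no 'Genome Version' UDF set.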
Example #4
    def __init__(self, *cmds, prelim_cmds=None, **cluster_config):
        """
        :param cmds: Commands to be written into the job submission script
        """
        self.interval = cfg.query('executor', 'join_interval', ret_default=30)
        self.job_id = None
        self.job_name = cluster_config['job_name']
        self.cmds = cmds
        self.prelim_cmds = prelim_cmds
        self.writer = self.script_writer(**cluster_config)
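# Hedged note (illustrative, not from the source): job_name is the only key read
# directly from cluster_config here; everything else is passed straight through to
# self.script_writer. A hypothetical instantiation of a concrete subclass such as
# SlurmExecutor might look like:
#
#     e = SlurmExecutor('path/to/a_cmd.sh', job_name='a_job', working_dir='/a/dir')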
Example #5
def execute(*cmds, env=None, prelim_cmds=None, **cluster_config):
    if env is None:
        env = cfg.query('executor', 'job_execution')

    if env == 'local':
        return local_execute(*cmds)
    else:
        return cluster_execute(*cmds,
                               env=env,
                               prelim_cmds=prelim_cmds,
                               **cluster_config)
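# Hedged usage sketch, mirroring the executor.execute(...).join() call in the
# _run() stage further down (argument values are illustrative only):
#
#     exit_status = execute('a_command.sh', job_name='a_job', working_dir='/a/dir', cpus=2, mem=4).join()
#
# With cfg['executor']['job_execution'] set to 'local' the commands run locally;
# any other value is handed off to cluster_execute() below.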
    def write_script(self):
        if self.prelim_cmds:
            self.writer.register_cmds(*self.prelim_cmds, parallel=False)

        pre_job_source = cfg.query('executor', 'pre_job_source')
        if pre_job_source:
            self.writer.register_cmd('source ' + pre_job_source)

        self.writer.line_break()
        self.writer.register_cmds(*self.cmds, parallel=True)
        self.writer.add_header()
        self.writer.save()
Example #7
def cluster_execute(*cmds, env=None, prelim_cmds=None, **cluster_config):
    """
    Execute commands on a compute cluster
    :param cmds: Commands to run in the job array
    :param env: The kind of resource manager being run
    :param prelim_cmds: Any commands to execute before starting a job array
    :param cluster_config: Keyword arguments for the executor (e.g. job_name, working_dir, cpus, mem)
    :return: ClusterExecutor
    """
    env = env or cfg.query('executor', 'job_execution')
    if env == 'slurm':
        cls = SlurmExecutor
    else:
        raise EGCGError('Unknown execution environment: %s' % env)

    e = cls(*cmds, prelim_cmds=prelim_cmds, **cluster_config)
    e.start()
    return e
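# Hedged config sketch (key names taken from the cfg.query calls in this module,
# values illustrative): a set-up that routes jobs through SLURM might look like
#
#     {'executor': {'job_execution': 'slurm',
#                   'join_interval': 30,
#                   'pre_job_source': '/path/to/an_environment.sh'}}
#
# Any job_execution value other than 'slurm' reaching cluster_execute() raises
# EGCGError ('local' is already intercepted by execute() above).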
    def _run(self):
        # Assess if the lanes need filtering
        q30_threshold = float(cfg.query('fastq_filterer', 'q30_threshold', ret_default=74))
        self.info('Q30 threshold: %s', q30_threshold)
        filter_lanes = {1: False, 2: False, 3: False, 4: False, 5: False, 6: False, 7: False, 8: False}
        for lane in self.dataset.lane_metrics:
            if q30_threshold > float(util.query_dict(lane, 'aggregated.pc_q30', ret_default=0)) > 0:
                self.warning(
                    'Will apply cycle and tile filtering to lane %s: %%Q30=%s < %s',
                    lane['lane_number'],
                    lane['aggregated']['pc_q30'],
                    q30_threshold
                )
                filter_lanes[int(lane['lane_number'])] = True

        try:
            detector = BadTileCycleDetector(self.dataset)
            bad_tiles = detector.detect_bad_tiles()
            bad_cycles = detector.detect_bad_cycles()
        except Exception as e:
            self.error(e)
            bad_tiles = {}
            bad_cycles = {}

        cmds = []
        for lane in filter_lanes:
            fq_pairs = find_all_fastq_pairs_for_lane(self.fastq_dir, lane)
            kwargs = {}
            if filter_lanes[lane]:
                trim_r1, trim_r2 = get_trim_values_for_bad_cycles(bad_cycles.get(lane), self.dataset.run_info)
                kwargs = {'tiles_to_filter': bad_tiles.get(lane), 'trim_r2': trim_r2}

            for fqs in fq_pairs:
                read_name_list = fqs[0][:-len('_R1_001.fastq.gz')] + '_phix_read_name.list'
                cmds.append(bash_commands.fastq_filterer(fqs, read_name_list, **kwargs))

        return executor.execute(
            *cmds,
            prelim_cmds=[bash_commands.fq_filt_prelim_cmd()],
            job_name='fastq_filterer',
            working_dir=self.job_dir,
            cpus=18,
            mem=10
        ).join()
    def _req(self, method, url, quiet=False, retries=5, **kwargs):
        # We can't upload JSON and files in the same request, so the json parameter is moved to data.
        # However, data can't carry complex structures that would need JSON encoding, which means
        # payloads with nested lists/dicts can't be uploaded at the same time as files.
        if kwargs.get('files') and kwargs.get('json'):
            if check_if_nested(kwargs.get('json')):
                raise RestCommunicationError(
                    'Cannot upload files and nested json in one query')
            kwargs['data'] = kwargs.pop('json')

        try:
            with self.lock:
                r = self.session.request(method, url, **kwargs)
        except Exception as e:
            if retries > 0:
                self.warning(
                    'Encountered a %s exception. %s retries remaining', str(e),
                    retries)
                sleep(cfg.query('rest_api', 'retry_interval', ret_default=1))
                return self._req(method, url, quiet, retries - 1, **kwargs)
            else:
                raise

        kwargs.pop('files', None)
        # e.g. POST <url> ({"some": "args"}) -> {"some": "content"}. Status code 201. Reason: CREATED
        report = '%s %s (%s) -> %s. Status code %s. Reason: %s' % (
            r.request.method, r.request.path_url, kwargs,
            r.content.decode('utf-8'), r.status_code, r.reason)
        if r.status_code in self.successful_statuses:
            if not quiet:
                self.debug(report)
            return r
        else:
            self.error(report)
            raise RestCommunicationError('Encountered a %s status code: %s' %
                                         (r.status_code, r.reason))
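# Hedged illustration of the files/json constraint handled at the top of _req()
# (payloads are hypothetical):
#
#     self._req('PATCH', url, json={'sample_id': 'a_sample'}, files={'f': open('report.pdf', 'rb')})
#
# works because the flat json payload can be moved to 'data', whereas combining
# files with a nested payload such as {'runs': [{'id': 1}]} fails check_if_nested()
# and raises RestCommunicationError before any request is sent.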
    def __init__(self):
        self.input_dir = cfg.query(self.type, 'input_dir')
        self.__triggerignore = None