def test_query_dict():
    data = {'this': {'that': 'other'}}
    assert util.query_dict(data, 'this') == {'that': 'other'}
    assert util.query_dict(data, 'this.that') == 'other'
    assert util.query_dict(data, 'nonexistent') is None
    assert util.query_dict(data, 'nonexistent',
                           ret_default='things') == 'things'
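The test above pins down the behaviour the remaining examples rely on: query_dict takes a dotted path, walks the nested dict one key at a time, and falls back to ret_default (None by default) as soon as a key is missing. A minimal sketch of such a helper, assuming only the behaviour shown in the test, could look like this:

def query_dict(data, path, ret_default=None):
    # Walk the nested dict one dotted component at a time,
    # e.g. 'this.that' -> data['this']['that']
    for key in path.split('.'):
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            return ret_default
    return data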
Example #2
def step_info(session,
              step_name,
              artifact_udfs=None,
              sample_udfs=None,
              output_udfs=None,
              container_type='96 wells plate'):
    kwargs = retrieve_args()
    time_from = kwargs.get('time_from')
    time_to = kwargs.get('time_to')
    library_id = query_dict(kwargs, 'match.container_id')
    project_name = query_dict(kwargs, 'match.project_id')
    sample_name = query_dict(kwargs, 'match.sample_id')
    flatten = kwargs.get('flatten', False) in ['True', 'true', True]
    if container_type == '96 wells plate':
        y_coords = 'ABCDEFGH'
    elif container_type == '384 wells plate':
        y_coords = 'ABCDEFGHIJKLMNOP'
    else:
        # For a tube the coordinate is always '1:1'
        y_coords = '1'
    all_step_containers = defaultdict(data_models.StepContainer)
    for data in queries.step_info(session,
                                  step_name,
                                  time_from=time_from,
                                  time_to=time_to,
                                  container_name=library_id,
                                  project_name=project_name,
                                  sample_name=sample_name,
                                  artifact_udfs=artifact_udfs,
                                  sample_udfs=sample_udfs,
                                  output_udfs=output_udfs):

        (luid, daterun, container_id, protocol_name, state_qc, state_modified,
         sample_id, project_id, wellx, welly) = data[:10]

        step_container = all_step_containers[container_id]
        step_container.id = container_id
        step_container.type = protocol_name
        specific_step = step_container.specific_steps[luid]
        specific_step.id = luid
        specific_step.date_run = daterun
        location = '%s:%s' % (y_coords[welly], wellx + 1)

        artifact = specific_step.artifacts[location]
        artifact.name = sample_id
        artifact.states[state_modified] = state_qc
        artifact.project_id = project_id
        artifact.location = location
        # Any UDF values come at the end of the row as alternating key, value pairs
        for i in range(10, len(data), 2):
            artifact.udfs[data[i]] = data[i + 1]

    if flatten:
        return sorted((item for l in all_step_containers.values()
                       for item in l.to_flatten_json()),
                      key=lambda l: l['id'])
    else:
        return sorted((l.to_json() for l in all_step_containers.values()),
                      key=lambda l: l['id'])
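For reference, the location string above is built from the zero-based (wellx, welly) pair returned by the query: welly indexes into y_coords and wellx is shifted to one-based. A quick illustration with hypothetical coordinates on a 96-well plate:

y_coords = 'ABCDEFGH'
for wellx, welly in [(0, 0), (11, 7)]:
    # (0, 0) -> 'A:1', (11, 7) -> 'H:12'
    print('%s:%s' % (y_coords[welly], wellx + 1))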
Example #3
    def failing_metrics(self):
        passfails = {}

        for metric in self.cfg:
            # Resolve the formula if one exists, otherwise look the metric up by name
            if 'formula' in self.cfg[metric]:
                metric_value = self.resolve_formula(
                    self.reviewable_data, self.cfg[metric]['formula'])
            else:
                metric_value = query_dict(self.reviewable_data, metric)
            comparison = self.cfg[metric]['comparison']
            compare_value = self.cfg[metric]['value']

            check = None
            if metric_value is None:
                check = False

            elif comparison == '>':
                check = metric_value >= compare_value

            elif comparison == '<':
                check = metric_value <= compare_value

            passfails[metric] = 'pass' if check else 'fail'

        return sorted(k for k, v in passfails.items() if v == 'fail')
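failing_metrics only assumes that each entry in self.cfg maps a dotted metric path (or a formula) to a comparison operator and a threshold. A hypothetical cfg and reviewable_data, with invented names and values, showing the shape this method expects:

cfg = {
    'aggregated.pc_q30': {'comparison': '>', 'value': 75},
    'aggregated.clean_yield_in_gb': {'comparison': '>', 'value': 120},
    'duplication_pc': {'comparison': '<', 'value': 25, 'formula': 'mapping.duplicates / mapping.total * 100'},
}
reviewable_data = {'aggregated': {'pc_q30': 80.1, 'clean_yield_in_gb': 95}}
# With these inputs, failing_metrics() returns
# ['aggregated.clean_yield_in_gb', 'duplication_pc']: the yield is below 120,
# and the formula cannot be resolved (no 'mapping' section), so its value is None.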
Example #4
    @staticmethod
    def resolve_formula(data, formula):
        # Replace each dotted token that query_dict can resolve with its value, then evaluate
        modif_formula = formula
        for word in re.findall(r'[\w.]+', formula):
            value = query_dict(data, word)
            if value:
                modif_formula = modif_formula.replace(word, str(value))
        try:
            return eval(modif_formula)
        except NameError:
            return None
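As a concrete, hypothetical illustration of the substitution step: every dotted token that query_dict can resolve is textually replaced before the eval, and anything left unresolved makes eval raise NameError, which is swallowed and turned into None.

data = {'mapping': {'duplicates': 5, 'total': 50}}
formula = 'mapping.duplicates / mapping.total * 100'
# substitution yields '5 / 50 * 100', which eval turns into 10.0;
# the token '100' resolves to None via query_dict and is left untouched
assert eval('5 / 50 * 100') == 10.0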
    def report(self):
        runs = query_dict(self.sample, 'aggregated.run_ids')
        non_useable_runs = sorted(set(r[ELEMENT_RUN_NAME] for r in self.non_useable_run_elements))

        s = '%s  (yield: %s / %s and coverage: %s / %s from %s) ' % (
            super().report(), self._amount_data(), self.required_yield_threshold, self._amount_coverage(),
            self.required_coverage_threshold, ', '.join(runs)
        )
        if non_useable_runs:
            s += '(non useable run elements in %s)' % ', '.join(non_useable_runs)
        else:
            s += '(no non useable run elements)'
        return s
Example #6
    @staticmethod
    def eve_get(*args, **kwargs):
        res = get(*args, **kwargs)
        data = res[0].get('data')
        next_page = query_dict(res[0], '_links.next')
        # depaginate recursively
        if next_page:
            match = re.match(r'\w+\?page=(\d+)', next_page.get('href'))
            previous_args = request.args
            # inject the page number into the request args so that Eve picks it up
            request.args = ImmutableMultiDict({'page': int(match.group(1))})
            data.extend(AutomaticReviewer.eve_get(*args, **kwargs))
            # restore the args that were there previously
            request.args = previous_args
        return data
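The recursion above relies on Eve's pagination links: when a page of results carries a _links.next entry, its href holds the resource name and the next page number, which the regex extracts. A hypothetical first-page payload, shaped only to match what this code reads, could be:

import re

res = ({
    'data': [{'sample_id': 'sample_1'}],  # invented record
    '_links': {'next': {'href': 'samples?page=2'}},
},)
next_page = query_dict(res[0], '_links.next')
match = re.match(r'\w+\?page=(\d+)', next_page.get('href'))
assert int(match.group(1)) == 2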
    def _run(self):
        # Assess if the lanes need filtering
        q30_threshold = float(cfg.query('fastq_filterer', 'q30_threshold', ret_default=74))
        self.info('Q30 threshold: %s', q30_threshold)
        filter_lanes = {1: False, 2: False, 3: False, 4: False, 5: False, 6: False, 7: False, 8: False}
        for lane in self.dataset.lane_metrics:
            if q30_threshold > float(util.query_dict(lane, 'aggregated.pc_q30', ret_default=0)) > 0:
                self.warning(
                    'Will apply cycle and tile filtering to lane %s: %%Q30=%s < %s',
                    lane['lane_number'],
                    lane['aggregated']['pc_q30'],
                    q30_threshold
                )
                filter_lanes[int(lane['lane_number'])] = True

        try:
            detector = BadTileCycleDetector(self.dataset)
            bad_tiles = detector.detect_bad_tiles()
            bad_cycles = detector.detect_bad_cycles()
        except Exception as e:
            self.error(e)
            bad_tiles = {}
            bad_cycles = {}

        cmds = []
        for lane in filter_lanes:
            fq_pairs = find_all_fastq_pairs_for_lane(self.fastq_dir, lane)
            kwargs = {}
            if filter_lanes[lane]:
                trim_r1, trim_r2 = get_trim_values_for_bad_cycles(bad_cycles.get(lane), self.dataset.run_info)
                kwargs = {'tiles_to_filter': bad_tiles.get(lane), 'trim_r2': trim_r2}

            for fqs in fq_pairs:
                read_name_list = fqs[0][:-len('_R1_001.fastq.gz')] + '_phix_read_name.list'
                cmds.append(bash_commands.fastq_filterer(fqs, read_name_list, **kwargs))

        return executor.execute(
            *cmds,
            prelim_cmds=[bash_commands.fq_filt_prelim_cmd()],
            job_name='fastq_filterer',
            working_dir=self.job_dir,
            cpus=18,
            mem=10
        ).join()
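One detail worth spelling out in the lane-filtering decision above: the chained comparison q30_threshold > pc_q30 > 0 only flags lanes that have some Q30 data but fall below the threshold, so a lane reporting 0 (no metrics yet) is left alone. A small illustration with made-up values:

q30_threshold = 74.0
for pc_q30 in (0, 68.2, 81.5):
    # 0 -> not filtered (no data), 68.2 -> filtered, 81.5 -> not filtered
    print(pc_q30, q30_threshold > pc_q30 > 0)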
def build_pipeline(dataset):
    sample_ids = [sample['sample_id'] for sample in dataset.samples_processed]
    project_source = os.path.join(cfg.query('project', 'input_dir'), dataset.name)
    gvcf_files = []
    for sample in dataset.samples_processed:
        # Only check for a gVCF when the sample has been through human processing, which generates one
        if query_dict(sample, 'aggregated.most_recent_proc.pipeline_used.name') == 'bcbio':
            gvcf_file = find_file(project_source, sample['sample_id'], sample['user_sample_id'] + '.g.vcf.gz')
            if not gvcf_file:
                raise PipelineError('Unable to find gVCF file for sample %s in %s' % (sample['sample_id'], project_source))
            gvcf_files.append(gvcf_file)

    if len(gvcf_files) < 2:
        # No need to run as there are not enough gvcfs to process
        cleanup = Cleanup(dataset=dataset)
    else:
        genotype_gvcfs = GenotypeGVCFs(dataset=dataset, gVCFs=gvcf_files)
        relatedness = Relatedness(dataset=dataset, previous_stages=[genotype_gvcfs])
        peddy = Peddy(dataset=dataset, ids=sample_ids, previous_stages=[genotype_gvcfs])
        parse = ParseRelatedness(dataset=dataset, ids=sample_ids, parse_method='parse_both', previous_stages=[relatedness, peddy])
        md5 = MD5Sum(dataset=dataset, previous_stages=[parse])
        output = Output(dataset=dataset, previous_stages=[md5])
        cleanup = Cleanup(dataset=dataset, previous_stages=[output])
    return cleanup
    def expect_qc_data(self, obs, exp):
        for e in sorted(exp):
            self.expect_equal(util.query_dict(obs, e), exp[e], e)

    def _get_datasets_for_statuses(self, statuses):
        self.debug('Creating Datasets for status %s', ', '.join(statuses))
        return [
            self.get_dataset(d[self.item_id], query_dict(d, 'aggregated.most_recent_proc'))
            for d in self._get_dataset_records_for_statuses(statuses)
        ]

    def _amount_coverage(self):
        return query_dict(self.sample, 'aggregated.from_run_elements.mean_coverage') or 0

    def _amount_data(self):
        y = query_dict(self.sample, 'aggregated.clean_yield_in_gb')
        if y:
            return int(y * 1000000000)
        else:
            return 0