def test_query_dict():
    data = {'this': {'that': 'other'}}
    assert util.query_dict(data, 'this') == {'that': 'other'}
    assert util.query_dict(data, 'this.that') == 'other'
    assert util.query_dict(data, 'nonexistent') is None
    assert util.query_dict(data, 'nonexistent', ret_default='things') == 'things'
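# Below is a minimal sketch (an assumption, not the project's actual implementation) of what
# util.query_dict could look like, consistent with the behaviour exercised by the test above:
# a dotted-path lookup into nested dicts that falls back to ret_default when a key is missing.
def _query_dict_sketch(data, path, ret_default=None):
    # walk the nested dict one dotted component at a time
    for key in path.split('.'):
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            return ret_default
    return data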
def step_info(session, step_name, artifact_udfs=None, sample_udfs=None, output_udfs=None,
              container_type='96 wells plate'):
    kwargs = retrieve_args()
    time_from = kwargs.get('time_from')
    time_to = kwargs.get('time_to')
    library_id = query_dict(kwargs, 'match.container_id')
    project_name = query_dict(kwargs, 'match.project_id')
    sample_name = query_dict(kwargs, 'match.sample_id')
    flatten = kwargs.get('flatten', False) in ['True', 'true', True]

    if container_type == '96 wells plate':
        y_coords = 'ABCDEFGH'
    elif container_type == '384 wells plate':
        y_coords = 'ABCDEFGHIJKLMNOP'
    else:
        # For tubes the coordinate should always be '1:1'
        y_coords = '1'

    all_step_containers = defaultdict(data_models.StepContainer)
    for data in queries.step_info(session, step_name, time_from=time_from, time_to=time_to,
                                  container_name=library_id, project_name=project_name,
                                  sample_name=sample_name, artifact_udfs=artifact_udfs,
                                  sample_udfs=sample_udfs, output_udfs=output_udfs):
        luid, daterun, container_id, protocol_name, state_qc, state_modified, \
            sample_id, project_id, wellx, welly = data[:10]
        step_container = all_step_containers[container_id]
        step_container.id = container_id
        step_container.type = protocol_name
        specific_step = step_container.specific_steps[luid]
        specific_step.id = luid
        specific_step.date_run = daterun
        location = '%s:%s' % (y_coords[welly], wellx + 1)
        artifact = specific_step.artifacts[location]
        artifact.name = sample_id
        artifact.states[state_modified] = state_qc
        artifact.project_id = project_id
        artifact.location = location
        # Whatever the number of UDF types, they come at the end of the row as key/value pairs
        for i in range(10, len(data), 2):
            artifact.udfs[data[i]] = data[i + 1]

    if flatten:
        return sorted(
            (item for l in all_step_containers.values() for item in l.to_flatten_json()),
            key=lambda l: l['id']
        )
    else:
        return sorted((l.to_json() for l in all_step_containers.values()), key=lambda l: l['id'])
def failing_metrics(self):
    passfails = {}
    for metric in self.cfg:
        # resolve the formula if it exists, otherwise resolve from the name of the metric
        if 'formula' in self.cfg[metric]:
            metric_value = self.resolve_formula(self.reviewable_data, self.cfg[metric]['formula'])
        else:
            metric_value = query_dict(self.reviewable_data, metric)

        comparison = self.cfg[metric]['comparison']
        compare_value = self.cfg[metric]['value']
        check = None
        if metric_value is None:
            check = False
        elif comparison == '>':
            check = metric_value >= compare_value
        elif comparison == '<':
            check = metric_value <= compare_value

        passfails[metric] = 'pass' if check else 'fail'

    return sorted(k for k, v in passfails.items() if v == 'fail')
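# For illustration, a hypothetical review configuration of the shape failing_metrics expects:
# each metric maps to a comparison operator and a threshold value, and can optionally carry a
# 'formula' evaluated over the reviewable data instead of a direct dotted-path lookup.
# The metric names and thresholds below are illustrative assumptions only.
_example_review_cfg = {
    'aggregated.clean_yield_in_gb': {'comparison': '>', 'value': 120},
    'aggregated.pc_duplicate_reads': {'comparison': '<', 'value': 35},
    'pc_q30': {
        'comparison': '>',
        'value': 75,
        'formula': 'aggregated.clean_yield_q30_in_gb / aggregated.clean_yield_in_gb * 100'
    },
}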
def resolve_formula(data, formula):
    # substitute each dotted-path token that resolves in the data, then evaluate the expression
    modif_formula = formula
    for word in re.findall(r'[\w.]+', formula):
        value = query_dict(data, word)
        if value:
            modif_formula = modif_formula.replace(word, str(value))
    try:
        return eval(modif_formula)
    except NameError:
        return None
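# Example (with hypothetical data) of how resolve_formula behaves: each dotted token in the
# formula that resolves via query_dict is replaced by its value, and the resulting arithmetic
# expression is evaluated; any token left unresolved makes eval raise NameError, which is
# caught and reported as None.
#     resolve_formula({'a': {'b': 96.0}, 'c': 120.0}, 'a.b / c * 100')
#     -> eval('96.0 / 120.0 * 100') == 80.0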
def report(self):
    runs = query_dict(self.sample, 'aggregated.run_ids')
    non_useable_runs = sorted(set(r[ELEMENT_RUN_NAME] for r in self.non_useable_run_elements))

    s = '%s (yield: %s / %s and coverage: %s / %s from %s) ' % (
        super().report(), self._amount_data(), self.required_yield_threshold,
        self._amount_coverage(), self.required_coverage_threshold, ', '.join(runs)
    )
    if non_useable_runs:
        s += '(non useable run elements in %s)' % ', '.join(non_useable_runs)
    else:
        s += '(no non useable run elements)'
    return s
def eve_get(*args, **kwargs):
    res = get(*args, **kwargs)
    data = res[0].get('data')
    next_page = query_dict(res[0], '_links.next')
    # depaginate recursively
    if next_page:
        match = re.match(r'\w+\?page=(\d+)', next_page.get('href'))
        previous_args = request.args
        # inject the page number into the request args so that Eve picks it up
        request.args = ImmutableMultiDict({'page': int(match.group(1))})
        data.extend(AutomaticReviewer.eve_get(*args, **kwargs))
        # restore the args that were there previously
        request.args = previous_args
    return data
def _run(self):
    # Assess if the lanes need filtering
    q30_threshold = float(cfg.query('fastq_filterer', 'q30_threshold', ret_default=74))
    self.info('Q30 threshold: %s', q30_threshold)

    filter_lanes = {1: False, 2: False, 3: False, 4: False, 5: False, 6: False, 7: False, 8: False}
    for lane in self.dataset.lane_metrics:
        if q30_threshold > float(util.query_dict(lane, 'aggregated.pc_q30', ret_default=0)) > 0:
            self.warning(
                'Will apply cycle and tile filtering to lane %s: %%Q30=%s < %s',
                lane['lane_number'], lane['aggregated']['pc_q30'], q30_threshold
            )
            filter_lanes[int(lane['lane_number'])] = True

    try:
        detector = BadTileCycleDetector(self.dataset)
        bad_tiles = detector.detect_bad_tiles()
        bad_cycles = detector.detect_bad_cycles()
    except Exception as e:
        self.error(e)
        bad_tiles = {}
        bad_cycles = {}

    cmds = []
    for lane in filter_lanes:
        fq_pairs = find_all_fastq_pairs_for_lane(self.fastq_dir, lane)
        kwargs = {}
        if filter_lanes[lane]:
            trim_r1, trim_r2 = get_trim_values_for_bad_cycles(bad_cycles.get(lane), self.dataset.run_info)
            kwargs = {'tiles_to_filter': bad_tiles.get(lane), 'trim_r2': trim_r2}

        for fqs in fq_pairs:
            read_name_list = fqs[0][:-len('_R1_001.fastq.gz')] + '_phix_read_name.list'
            cmds.append(bash_commands.fastq_filterer(fqs, read_name_list, **kwargs))

    return executor.execute(
        *cmds,
        prelim_cmds=[bash_commands.fq_filt_prelim_cmd()],
        job_name='fastq_filterer',
        working_dir=self.job_dir,
        cpus=18,
        mem=10
    ).join()
def build_pipeline(dataset):
    sample_ids = [sample['sample_id'] for sample in dataset.samples_processed]
    project_source = os.path.join(cfg.query('project', 'input_dir'), dataset.name)
    gvcf_files = []
    for sample in dataset.samples_processed:
        # Only check for a gVCF when the sample has been through human processing, which generates one
        if query_dict(sample, 'aggregated.most_recent_proc.pipeline_used.name') == 'bcbio':
            gvcf_file = find_file(project_source, sample['sample_id'], sample['user_sample_id'] + '.g.vcf.gz')
            if not gvcf_file:
                raise PipelineError('Unable to find gVCF file for sample %s in %s' % (sample['sample_id'], project_source))
            gvcf_files.append(gvcf_file)

    if len(gvcf_files) < 2:
        # Not enough gVCFs to process, so skip straight to cleanup
        cleanup = Cleanup(dataset=dataset)
    else:
        genotype_gvcfs = GenotypeGVCFs(dataset=dataset, gVCFs=gvcf_files)
        relatedness = Relatedness(dataset=dataset, previous_stages=[genotype_gvcfs])
        peddy = Peddy(dataset=dataset, ids=sample_ids, previous_stages=[genotype_gvcfs])
        parse = ParseRelatedness(dataset=dataset, ids=sample_ids, parse_method='parse_both',
                                 previous_stages=[relatedness, peddy])
        md5 = MD5Sum(dataset=dataset, previous_stages=[parse])
        output = Output(dataset=dataset, previous_stages=[md5])
        cleanup = Cleanup(dataset=dataset, previous_stages=[output])

    return cleanup
def expect_qc_data(self, obs, exp):
    for e in sorted(exp):
        self.expect_equal(util.query_dict(obs, e), exp[e], e)
def _get_datasets_for_statuses(self, statuses):
    self.debug('Creating Datasets for status %s', ', '.join(statuses))
    return [
        self.get_dataset(d[self.item_id], query_dict(d, 'aggregated.most_recent_proc'))
        for d in self._get_dataset_records_for_statuses(statuses)
    ]
def _amount_coverage(self):
    return query_dict(self.sample, 'aggregated.from_run_elements.mean_coverage') or 0
def _amount_data(self):
    y = query_dict(self.sample, 'aggregated.clean_yield_in_gb')
    if y:
        return int(y * 1000000000)
    else:
        return 0