def test_parse_file(self, mock_load_xl):
    """Check parse_file across all supported extensions and its Excel-loading behavior."""
    mock_load_xl.return_value.sheetnames = ['sheet1']
    mock_load_xl.return_value.__getitem__.return_value = MOCK_EXCEL_SHEET

    # Every supported extension should parse to the same canonical row data.
    for extension, raw_bytes in TEST_DATA_TYPES.items():
        file_name = 'test.{}'.format(extension)
        parsed = parse_file(file_name, StringIO(raw_bytes.decode('utf-8')))
        self.assertListEqual(parsed, PARSED_DATA)

    # Excel files must be passed through to the loader and opened read-only.
    for call_args in mock_load_xl.call_args_list:
        stream = call_args.args[0]
        self.assertEqual(stream.read().encode('utf-8'), EXCEL_DATA)
        self.assertDictEqual(call_args.kwargs, {'read_only': True})
def _load_mapping_file(mapping_file_id, mapping_file_path): id_mapping = {} file_content = [] if mapping_file_id: file_content = load_uploaded_file(mapping_file_id) elif mapping_file_path: file_content = parse_file(mapping_file_path, file_iter(mapping_file_path)) for line in file_content: if len(line) != 2: raise ValueError("Must contain 2 columns: " + ', '.join(line)) id_mapping[line[0]] = line[1] return id_mapping
def handle(self, *args, **options):
    """Create ProjectCategory records from a two-column file.

    Expects a header row followed by rows of (project name, comma-separated
    category names). Links each category to all of its listed projects and
    prints a per-category project count.
    """
    file_path = options['file']
    with open(file_path) as f:
        rows = parse_file(file_path, f)

    projects_by_category = defaultdict(list)
    # rows[0] is the header; rows with an empty category cell are skipped.
    for row in rows[1:]:
        if not row[1]:
            continue
        for raw_category in row[1].split(','):
            projects_by_category[raw_category.strip()].append(row[0].strip())

    for category_name, project_names in projects_by_category.items():
        category = ProjectCategory.objects.create(name=category_name)
        category.projects.set(Project.objects.filter(name__in=project_names))
        print('{}: {} projects'.format(category_name, len(project_names)))
def upload_qc_pipeline_output(request):
    """Parse a QC pipeline output file and annotate matching seqr Individuals.

    Expects a JSON request body with a 'file' path. The file must be tabular
    with a header row containing at least 'seqr_id', 'data_type',
    'filter_flags', 'qc_metrics_filters' and 'qc_pop'. Matched Individuals get
    their filter_flags, pop_platform_filters and population updated.

    Returns:
        JSON response with 'errors', 'warnings' and 'info' lists; 400 on any
        validation failure.
    """
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))
    # First row is the header; zip it against each data row to build dicts.
    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    # Robustness fix: a header-only/empty file used to raise IndexError (500)
    # at json_records[0] below; report it as a client error instead.
    if not json_records:
        message = 'No records found in the uploaded file'
        return create_json_response({'errors': [message]}, status=400, reason=message)

    missing_columns = [
        field for field in ['seqr_id', 'data_type', 'filter_flags', 'qc_metrics_filters', 'qc_pop']
        if field not in json_records[0]
    ]
    if missing_columns:
        message = 'The following required columns are missing: {}'.format(', '.join(missing_columns))
        return create_json_response({'errors': [message]}, status=400, reason=message)

    # 'n/a' rows carry no dataset type; all remaining rows must agree on one.
    dataset_types = {record['data_type'].lower() for record in json_records if record['data_type'].lower() != 'n/a'}
    if len(dataset_types) == 0:
        message = 'No dataset type detected'
        return create_json_response({'errors': [message]}, status=400, reason=message)
    if len(dataset_types) > 1:
        # Bug fix: separator was ' ,' (space-comma); sorted for a deterministic message.
        message = 'Multiple dataset types detected: {}'.format(', '.join(sorted(dataset_types)))
        return create_json_response({'errors': [message]}, status=400, reason=message)

    # Hoisted: was list(dataset_types)[0] recomputed three times.
    detected_type = next(iter(dataset_types))
    if detected_type not in DATASET_TYPE_MAP:
        message = 'Unexpected dataset type detected: "{}" (should be "exome" or "genome")'.format(detected_type)
        return create_json_response({'errors': [message]}, status=400, reason=message)

    dataset_type = DATASET_TYPE_MAP[detected_type]
    info_message = 'Parsed {} {} samples'.format(len(json_records), dataset_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    sample_ids = {record['seqr_id'] for record in json_records}
    samples = Sample.objects.filter(
        sample_id__in=sample_ids,
        sample_type=Sample.SAMPLE_TYPE_WES if dataset_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
    ).exclude(individual__family__project__name__in=EXCLUDE_PROJECTS).exclude(
        individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals'] for agg in
        samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date'] for agg in
        samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    # Only the most recently loaded sample per individual counts as a match.
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    for record in json_records:
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(record['seqr_id'], [])
            if individual_latest_sample_id[individual_id] == record['seqr_id']
        })

    missing_sample_ids = {record['seqr_id'] for record in json_records if not record['individual_ids']}
    if missing_sample_ids:
        # Fall back to matching sample ids directly against individual ids,
        # excluding individuals whose samples are of the other sample type.
        individuals = Individual.objects.filter(individual_id__in=missing_sample_ids).exclude(
            family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY).exclude(
            sample__sample_type=Sample.SAMPLE_TYPE_WGS if dataset_type == 'exome' else Sample.SAMPLE_TYPE_WES)
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for record in json_records:
            # Only accept an unambiguous (single-individual) fallback match.
            if not record['individual_ids'] and len(individual_db_ids_by_id[record['seqr_id']]) == 1:
                record['individual_ids'] = individual_db_ids_by_id[record['seqr_id']]
                missing_sample_ids.remove(record['seqr_id'])

    multi_individual_samples = {
        record['seqr_id']: len(record['individual_ids']) for record in json_records
        if len(record['individual_ids']) > 1
    }
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(len(multi_individual_samples)))
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(sorted([
                '{} ({})'.format(sample_id, count) for sample_id, count in multi_individual_samples.items()
            ]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    unknown_filter_flags = set()
    unknown_pop_filter_flags = set()
    # Typo fix: was 'inidividuals_by_population'.
    individuals_by_population = defaultdict(list)
    for record in json_records:
        filter_flags = {}
        for flag in json.loads(record['filter_flags']):
            # The coverage flag's column name is suffixed with the dataset type.
            flag = '{}_{}'.format(flag, dataset_type) if flag == 'coverage' else flag
            flag_col = FILTER_FLAG_COL_MAP.get(flag, flag)
            if flag_col in record:
                filter_flags[flag] = record[flag_col]
            else:
                unknown_filter_flags.add(flag)

        pop_platform_filters = {}
        for flag in json.loads(record['qc_metrics_filters']):
            flag_col = 'sample_qc.{}'.format(flag)
            if flag_col in record:
                pop_platform_filters[flag] = record[flag_col]
            else:
                unknown_pop_filter_flags.add(flag)

        if filter_flags or pop_platform_filters:
            Individual.objects.filter(id__in=record['individual_ids']).update(
                filter_flags=filter_flags or None, pop_platform_filters=pop_platform_filters or None)

        individuals_by_population[record['qc_pop'].upper()] += record['individual_ids']

    for population, indiv_ids in individuals_by_population.items():
        Individual.objects.filter(id__in=indiv_ids).update(population=population)

    if unknown_filter_flags:
        message = 'The following filter flags have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_filter_flags))
        logger.info(message)
        warnings.append(message)
    if unknown_pop_filter_flags:
        message = 'The following population platform filters have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_pop_filter_flags))
        logger.info(message)
        warnings.append(message)

    message = 'Found and updated matching seqr individuals for {} samples'.format(
        len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
def load_mapping_file(mapping_file_path, user):
    """Read the file at *mapping_file_path* on behalf of *user* and parse it into an ID-mapping dict."""
    lines = file_iter(mapping_file_path, user=user)
    parsed_rows = parse_file(mapping_file_path, lines)
    return _load_mapping_file(parsed_rows)
def upload_qc_pipeline_output(request):
    """Parse a QC pipeline output file and update QC fields on matching seqr Individuals.

    Expects a JSON request body with a 'file' path. The file is parsed into
    per-sample record dicts keyed by the header row; validation and
    type-detection are delegated to _parse_raw_qc_records, and the actual
    per-individual updates to _update_individuals_sv_qc /
    _update_individuals_variant_qc.

    Returns:
        JSON response with 'errors', 'warnings' and 'info' lists; 400 if the
        records fail validation.
    """
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))
    # First row is the header; zip it against each data row to build dicts.
    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    try:
        dataset_type, data_type, records_by_sample_id = _parse_raw_qc_records(json_records)
    except ValueError as e:
        # Validation failures from the parser are surfaced as client errors.
        return create_json_response({'errors': [str(e)]}, status=400, reason=str(e))

    info_message = 'Parsed {} {} samples'.format(
        len(json_records), 'SV' if dataset_type == Sample.DATASET_TYPE_SV_CALLS else data_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    # Candidate samples: right sample/dataset type, not in excluded projects.
    samples = Sample.objects.filter(
        sample_id__in=records_by_sample_id.keys(),
        sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
        dataset_type=dataset_type,
    ).exclude(
        individual__family__project__name__in=EXCLUDE_PROJECTS
    ).exclude(individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    sample_individuals = {
        agg['sample_id']: agg['individuals']
        for agg in samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date']
        for agg in samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    # Only an individual's most recently loaded sample counts as its match.
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    for sample_id, record in records_by_sample_id.items():
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(sample_id, [])
            if individual_latest_sample_id[individual_id] == sample_id
        })

    missing_sample_ids = {sample_id for sample_id, record in records_by_sample_id.items() if not record['individual_ids']}
    if missing_sample_ids:
        # Fall back to matching sample ids directly against individual ids,
        # restricted to individuals with a sample of the expected type.
        individuals = Individual.objects.filter(individual_id__in=missing_sample_ids).exclude(
            family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY).filter(
            sample__sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS).distinct()
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for sample_id, record in records_by_sample_id.items():
            # NOTE(review): '>= 1' accepts ambiguous (multi-individual) fallback
            # matches as well — presumably intentional here; confirm against history.
            if not record['individual_ids'] and len(individual_db_ids_by_id[sample_id]) >= 1:
                record['individual_ids'] = individual_db_ids_by_id[sample_id]
                missing_sample_ids.remove(sample_id)

    multi_individual_samples = {
        sample_id: len(record['individual_ids'])
        for sample_id, record in records_by_sample_id.items() if len(record['individual_ids']) > 1}
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(len(multi_individual_samples)))
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(
                sorted(['{} ({})'.format(sample_id, count) for sample_id, count in multi_individual_samples.items()]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    records_with_individuals = [
        record for sample_id, record in records_by_sample_id.items() if sample_id not in missing_sample_ids
    ]
    # SV and variant QC write different Individual fields; dispatch on dataset type.
    if dataset_type == Sample.DATASET_TYPE_SV_CALLS:
        _update_individuals_sv_qc(records_with_individuals, request.user)
    else:
        _update_individuals_variant_qc(records_with_individuals, data_type, warnings, request.user)

    message = 'Found and updated matching seqr individuals for {} samples'.format(len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
def load_mapping_file(mapping_file_path):
    """Parse the file at *mapping_file_path* into an ID-mapping dict."""
    rows = parse_file(mapping_file_path, file_utils.file_iter(mapping_file_path))
    return _load_mapping_file(rows)