def _stream_file(request, path):
    # based on https://gist.github.com/dcwatson/cb5d8157a8fa5a4a046e
    """Stream the file at ``path`` to the client, honoring bounded HTTP Range requests.

    Args:
        request: Django request; the HTTP_RANGE header is consulted for partial content.
        path: path of the file to stream (resolved by file_iter, which also applies
            per-user access checks via request.user).
    Returns:
        StreamingHttpResponse: status 206 with Content-Range/Content-Length for a
        fully bounded "bytes=first-last" range, otherwise a full streaming response.
    """
    content_type = 'application/octet-stream'
    range_header = request.META.get('HTTP_RANGE', None)
    range_match = None
    if range_header:
        # Only a fully-bounded range can be served with an accurate Content-Length.
        # The previous pattern accepted an empty end group ("bytes=0-") and then
        # crashed on int('') — open-ended or malformed ranges now fall through to a
        # full response instead, which RFC 7233 explicitly allows a server to do.
        range_match = re.compile(r'bytes\s*=\s*(\d+)\s*-\s*(\d+)', re.I).match(range_header)
    if range_match:
        first_byte, last_byte = range_match.groups()
        first_byte = int(first_byte)
        last_byte = int(last_byte)
        length = last_byte - first_byte + 1
        resp = StreamingHttpResponse(
            file_iter(path, byte_range=(first_byte, last_byte), raw_content=True, user=request.user),
            status=206, content_type=content_type)
        resp['Content-Length'] = str(length)
        # "/*" marks the complete length as unknown (RFC 7233 section 4.2); the
        # original omitted the total entirely, which is not a valid Content-Range.
        resp['Content-Range'] = 'bytes %s-%s/*' % (first_byte, last_byte)
    else:
        resp = StreamingHttpResponse(file_iter(path, raw_content=True, user=request.user), content_type=content_type)
    resp['Accept-Ranges'] = 'bytes'
    return resp
def _validate_vcf(vcf_path, sample_type=None, genome_version=None): if not vcf_path or not isinstance(vcf_path, basestring): raise ValueError("Invalid vcf_path arg: %(vcf_path)s" % locals()) if not does_file_exist(vcf_path): raise ValueError("%(vcf_path)s not found" % locals()) header_line = None for i, line in enumerate(file_iter(vcf_path)): if line.startswith("#CHROM"): header_line = line break if line.startswith("#"): continue else: break if i > 20000: break # there's no way header is this long if not header_line: raise ValueError( "Unexpected VCF header. #CHROM not found before line: " + line) # TODO if annotating using gcloud, check whether dataproc has access to file # TODO check header, sample_type, genome_version header_fields = header_line.strip().split('\t') sample_ids = header_fields[9:] return sample_ids
def handle(self, *args, **options):
    """Management-command entry point: parse a pedigree file and add/update the
    individuals and families it describes in the given project.

    Raises:
        CommandError: on an invalid project id, an unreadable pedigree file,
            or pedigree parse errors.
    """
    # parse and validate args
    validate_only = options["validate_only"]
    project_guid = options["project_id"]
    pedigree_file_path = options["pedigree_file"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        # BUGFIX: the format key was %(pedigree_file)s, which is not a local
        # variable here, so this error path itself raised KeyError
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            logger.warn(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
def _load_mapping_file(mapping_file_id, mapping_file_path): id_mapping = {} file_content = [] if mapping_file_id: file_content = load_uploaded_file(mapping_file_id) elif mapping_file_path: file_content = parse_file(mapping_file_path, file_iter(mapping_file_path)) for line in file_content: if len(line) != 2: raise ValueError("Must contain 2 columns: " + ', '.join(line)) id_mapping[line[0]] = line[1] return id_mapping
def add_individuals_from_pedigree_file(project, pedigree_file_path, validate_only=False):
    """Parse a pedigree file and add/update the individuals and families it
    describes in the given project.

    Args:
        project: Project model instance to add the individuals/families to.
        pedigree_file_path: local path of the pedigree table.
        validate_only: if True, parse and report errors but write nothing.
    Raises:
        CommandError: if the file is unreadable or fails to parse.
    """
    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        # BUGFIX: the format key was %(pedigree_file)s, which is not a local
        # variable here, so this error path itself raised KeyError
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            logger.warn(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
def _validate_vcf(vcf_path):
    """Validate that the file at vcf_path has a #CHROM header line with at least
    one sample column.

    Raises:
        Exception: if no #CHROM header is found before the first data line, or
            if the header contains no sample ids (columns 10+).
    """
    header_line = None
    line = None  # retained after the loop for the error message; None if the file is empty
    for line in file_iter(vcf_path):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if not line.startswith("#"):
            break  # first non-header line reached without finding #CHROM
    if not header_line:
        # BUGFIX: on an empty file `line` was never bound, so this raised
        # NameError instead of the intended Exception
        raise Exception("Unexpected VCF header. #CHROM not found before line: {}".format(line))

    header_fields = header_line.strip().split('\t')
    # VCF columns 1-9 are fixed (CHROM..FORMAT); everything after is a sample id
    sample_ids = header_fields[9:]
    if not sample_ids:
        raise Exception('No samples found in VCF "{}"'.format(vcf_path))
def _validate_vcf(vcf_path):
    """Validate that the file at vcf_path has a #CHROM header line with at least
    one sample column.

    Raises:
        Exception: if no #CHROM header is found before the first data line, or
            if the header contains no sample ids (columns 10+).
    """
    header_line = None
    line = None  # retained after the loop for the error message; None if the file is empty
    for line in file_utils.file_iter(vcf_path):
        if line.startswith("#CHROM"):
            header_line = line
            break
        if not line.startswith("#"):
            break  # first non-header line reached without finding #CHROM
    if not header_line:
        # BUGFIX: on an empty file `line` was never bound, so this raised
        # NameError instead of the intended Exception
        raise Exception("Unexpected VCF header. #CHROM not found before line: {}".format(line))

    header_fields = header_line.strip().split('\t')
    # VCF columns 1-9 are fixed (CHROM..FORMAT); everything after is a sample id
    sample_ids = header_fields[9:]
    if not sample_ids:
        raise Exception('No samples found in VCF "{}"'.format(vcf_path))
def upload_qc_pipeline_output(request):
    """Ingest a QC-pipeline output table and store per-individual QC flags.

    Reads a file path from the JSON request body, parses the table, matches each
    row's seqr_id to Sample/Individual records, then writes filter_flags,
    pop_platform_filters and population onto the matched Individuals.

    Returns:
        JSON response with 'errors', 'warnings' and 'info' message lists
        (status 400 with an error message on validation failure).
    """
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))
    # first row is the header; zip each subsequent row into a column->value dict
    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    missing_columns = [field for field in ['seqr_id', 'data_type', 'filter_flags', 'qc_metrics_filters', 'qc_pop'] if field not in json_records[0]]
    if missing_columns:
        message = 'The following required columns are missing: {}'.format(', '.join(missing_columns))
        return create_json_response({'errors': [message]}, status=400, reason=message)

    # the whole file must describe exactly one known dataset type ('n/a' rows ignored)
    dataset_types = {record['data_type'].lower() for record in json_records if record['data_type'].lower() != 'n/a'}
    if len(dataset_types) == 0:
        message = 'No dataset type detected'
        return create_json_response({'errors': [message]}, status=400, reason=message)
    elif len(dataset_types) > 1:
        # NOTE(review): ' ,' separator looks like a typo for ', ' — left as-is
        # since it is a runtime-visible message
        message = 'Multiple dataset types detected: {}'.format(' ,'.join(dataset_types))
        return create_json_response({'errors': [message]}, status=400, reason=message)
    elif list(dataset_types)[0] not in DATASET_TYPE_MAP:
        message = 'Unexpected dataset type detected: "{}" (should be "exome" or "genome")'.format(list(dataset_types)[0])
        return create_json_response({'errors': [message]}, status=400, reason=message)

    dataset_type = DATASET_TYPE_MAP[list(dataset_types)[0]]

    info_message = 'Parsed {} {} samples'.format(len(json_records), dataset_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    # candidate samples of the matching sample type, excluding blacklisted projects
    sample_ids = {record['seqr_id'] for record in json_records}
    samples = Sample.objects.filter(
        sample_id__in=sample_ids,
        sample_type=Sample.SAMPLE_TYPE_WES if dataset_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
    ).exclude(individual__family__project__name__in=EXCLUDE_PROJECTS).exclude(
        individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    # sample_id -> distinct individual db ids that have a sample with that id
    sample_individuals = {
        agg['sample_id']: agg['individuals'] for agg in
        samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    # individual db id -> most recent loaded_date among that individual's samples
    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date'] for agg in
        samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    # individual db id -> sample_id of that individual's most recently loaded sample
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    # attach to each record the individuals whose LATEST sample carries this seqr_id
    for record in json_records:
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(record['seqr_id'], [])
            if individual_latest_sample_id[individual_id] == record['seqr_id']
        })

    # fallback: match unresolved seqr_ids directly against Individual ids
    missing_sample_ids = {record['seqr_id'] for record in json_records if not record['individual_ids']}
    if missing_sample_ids:
        individuals = Individual.objects.filter(individual_id__in=missing_sample_ids).exclude(
            family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY).exclude(
            sample__sample_type=Sample.SAMPLE_TYPE_WGS if dataset_type == 'exome' else Sample.SAMPLE_TYPE_WES)
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for record in json_records:
            # only accept the fallback when the match is unambiguous (exactly one individual)
            if not record['individual_ids'] and len(individual_db_ids_by_id[record['seqr_id']]) == 1:
                record['individual_ids'] = individual_db_ids_by_id[record['seqr_id']]
                missing_sample_ids.remove(record['seqr_id'])

    multi_individual_samples = {record['seqr_id']: len(record['individual_ids'])
                                for record in json_records if len(record['individual_ids']) > 1}
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(len(multi_individual_samples)))
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(sorted(['{} ({})'.format(sample_id, count)
                for sample_id, count in multi_individual_samples.items()]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    unknown_filter_flags = set()
    unknown_pop_filter_flags = set()

    # NOTE: variable name is misspelled ("inidividuals") in the original; kept as-is
    inidividuals_by_population = defaultdict(list)
    for record in json_records:
        # collect per-record filter flags, translating column names via FILTER_FLAG_COL_MAP
        filter_flags = {}
        for flag in json.loads(record['filter_flags']):
            # the coverage flag's column name is suffixed with the dataset type
            flag = '{}_{}'.format(flag, dataset_type) if flag == 'coverage' else flag
            flag_col = FILTER_FLAG_COL_MAP.get(flag, flag)
            if flag_col in record:
                filter_flags[flag] = record[flag_col]
            else:
                unknown_filter_flags.add(flag)

        # population/platform QC filters live in 'sample_qc.<flag>' columns
        pop_platform_filters = {}
        for flag in json.loads(record['qc_metrics_filters']):
            flag_col = 'sample_qc.{}'.format(flag)
            if flag_col in record:
                pop_platform_filters[flag] = record[flag_col]
            else:
                unknown_pop_filter_flags.add(flag)

        if filter_flags or pop_platform_filters:
            # `or None` stores NULL rather than an empty dict when a side is empty
            Individual.objects.filter(id__in=record['individual_ids']).update(
                filter_flags=filter_flags or None, pop_platform_filters=pop_platform_filters or None)

        inidividuals_by_population[record['qc_pop'].upper()] += record['individual_ids']

    # bulk-update population per group of individuals
    for population, indiv_ids in inidividuals_by_population.items():
        Individual.objects.filter(id__in=indiv_ids).update(population=population)

    if unknown_filter_flags:
        message = 'The following filter flags have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_filter_flags))
        logger.info(message)
        warnings.append(message)
    if unknown_pop_filter_flags:
        message = 'The following population platform filters have no known corresponding value and were not saved: {}'.format(
            ', '.join(unknown_pop_filter_flags))
        logger.info(message)
        warnings.append(message)

    message = 'Found and updated matching seqr individuals for {} samples'.format(
        len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
def handle(self, *args, **options):
    """Management-command entry point: link a VCF-backed elasticsearch dataset to
    a project's sample records, optionally parsing a pedigree file, remapping VCF
    sample ids, or exporting a pedigree-file template for unmatched samples.
    """
    analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

    # parse and validate args
    sample_type = options["sample_type"]
    genome_version = options["genome_version"]
    validate_only = options["validate_only"]
    remap_sample_ids = options["remap_sample_ids"]
    max_edit_distance = options["max_edit_distance_for_id_match"]
    pedigree_file_path = options["pedigree_file"]
    export_pedigree_file_template = options["export_pedigree_file_template"]
    project_guid = options["project_id"]
    vcf_path = options["vcf_path"]
    elasticsearch_index = options["elasticsearch_index"]
    is_loaded = options["is_loaded"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    #if project.genome_version != genome_version:
    #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    if pedigree_file_path:
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

        if warnings:
            for message in warnings:
                logger.warn(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)

    # validate VCF and get sample ids
    vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

    # optionally translate VCF sample ids through a 2-column tab-separated mapping file
    if remap_sample_ids:
        if not does_file_exist(remap_sample_ids):
            raise ValueError("File not found: " + remap_sample_ids)

        id_mapping = {}
        for line in file_iter(remap_sample_ids):
            fields = line.strip().split("\t")
            if len(fields) != 2:
                raise ValueError("Must contain 2 columns: " + str(fields))
            id_mapping[fields[0]] = fields[1]

        remapped_vcf_sample_ids = []
        for sample_id in vcf_sample_ids:
            if sample_id in id_mapping:
                remapped_vcf_sample_ids.append(id_mapping[sample_id])
                print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
            else:
                # ids without a mapping entry pass through unchanged
                remapped_vcf_sample_ids.append(sample_id)
                print("No sample id mapping for %s" % sample_id)
        vcf_sample_ids = remapped_vcf_sample_ids

    # match VCF sample ids to existing Sample records (fuzzy match up to
    # max_edit_distance); creates Sample records unless validating only
    vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
        project,
        sample_ids=vcf_sample_ids,
        sample_type=sample_type,
        max_edit_distance=max_edit_distance,
        create_sample_records=not validate_only,
    )

    # optionally write a pedigree template for the VCF samples that didn't match,
    # then exit without linking anything
    if export_pedigree_file_template:
        with open(export_pedigree_file_template, "w") as out_f:
            out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
            for vcf_sample_id in vcf_sample_ids:
                if vcf_sample_id in vcf_sample_ids_to_sample_records:
                    continue
                # unmatched samples get family_id == individual_id == the VCF sample id
                family_id = individual_id = vcf_sample_id
                out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
        logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
        return

    if len(vcf_sample_ids_to_sample_records) == 0:
        all_vcf_sample_id_count = len(vcf_sample_ids)
        all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
        logger.info("None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
        return

    # retrieve or create Dataset record and link it to sample(s)
    dataset = get_or_create_elasticsearch_dataset(
        project=project,
        analysis_type=analysis_type,
        genome_version=genome_version,
        source_file_path=vcf_path,
        elasticsearch_index=elasticsearch_index,
        is_loaded=is_loaded,
    )

    # stamp the load time the first time the dataset is marked loaded
    if is_loaded and not dataset.loaded_date:
        dataset.loaded_date=timezone.now()
        dataset.save()

    link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

    # check if all VCF samples loaded already
    vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
    existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
    if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
        logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
        return
    elif not dataset.is_loaded:
        logger.info("Dataset not loaded. %s Loading..." % (is_loaded,))
    elif len(vcf_sample_ids - existing_sample_ids) != 0:
        logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (vcf_sample_ids - existing_sample_ids, ))

    logger.info("done")
def handle(self, *args, **options):
    """Management-command entry point: validate a VCF against a project, link a
    Dataset record to matched sample records, and trigger variant loading.
    Optionally parses a pedigree file first or exports a pedigree-file template
    for unmatched samples.
    """
    analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

    # parse and validate args
    sample_type = options["sample_type"]
    genome_version = options["genome_version"]
    validate_only = options["validate_only"]
    max_edit_distance = options["max_edit_distance_for_id_match"]
    pedigree_file_path = options["pedigree_file"]
    export_pedigree_file_template = options["export_pedigree_file_template"]
    project_guid = options["project_id"]
    vcf_path = options["vcf_path"]
    dataset_id = options["dataset_id"]

    # look up project id and validate other args
    try:
        project = Project.objects.get(guid=project_guid)
    except ObjectDoesNotExist:
        raise CommandError("Invalid project id: %(project_guid)s" % locals())

    if project.genome_version != genome_version:
        raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    if pedigree_file_path:
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

        if warnings:
            for message in warnings:
                logger.warn(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)

    # validate VCF and get sample ids
    vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

    # match VCF sample ids to existing Sample records (fuzzy match up to
    # max_edit_distance); creates records for new ids unless validating only
    vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
        project,
        sample_ids=vcf_sample_ids,
        sample_type=sample_type,
        max_edit_distance=max_edit_distance,
        create_records_for_new_sample_ids=not validate_only,
    )

    # optionally write a pedigree template for the VCF samples that didn't match,
    # then exit without linking anything
    if export_pedigree_file_template:
        with open(export_pedigree_file_template, "w") as out_f:
            out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
            for vcf_sample_id in vcf_sample_ids:
                if vcf_sample_id in vcf_sample_ids_to_sample_records:
                    continue
                # unmatched samples get family_id == individual_id == the VCF sample id
                family_id = individual_id = vcf_sample_id
                out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
        logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
        return

    if len(vcf_sample_ids_to_sample_records) == 0:
        all_vcf_sample_id_count = len(vcf_sample_ids)
        all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
        logger.info(("No matches found between the %(all_vcf_sample_id_count)s sample id(s) in the VCF and "
            "the %(all_project_sample_id_count)s %(sample_type)s sample id(s) in %(project_guid)s") % locals())
        return

    if validate_only:
        return

    # retrieve or create Dataset record and link it to sample(s)
    dataset = get_or_create_dataset(
        analysis_type=analysis_type,
        source_file_path=vcf_path,
        project=project,
        dataset_id=dataset_id,
    )

    link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

    # check if all VCF samples loaded already
    vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
    existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
    if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
        logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
        return

    # load the VCF
    _load_variants(dataset)

    logger.info("done")
def load_mapping_file(mapping_file_path, user):
    """Read and parse the ID-mapping file at mapping_file_path (accessed as the
    given user) and return the resulting id-mapping dict."""
    parsed_rows = parse_file(mapping_file_path, file_iter(mapping_file_path, user=user))
    return _load_mapping_file(parsed_rows)
def upload_qc_pipeline_output(request):
    """Ingest a QC-pipeline output table and store per-individual QC results.

    Reads a file path from the JSON request body, parses the table, matches each
    row's sample id to Sample/Individual records, then delegates the per-individual
    updates to _update_individuals_sv_qc or _update_individuals_variant_qc
    depending on the dataset type.

    Returns:
        JSON response with 'errors', 'warnings' and 'info' message lists
        (status 400 with the parse error on validation failure).
    """
    file_path = json.loads(request.body)['file']
    raw_records = parse_file(file_path, file_iter(file_path))
    # first row is the header; zip each subsequent row into a column->value dict
    json_records = [dict(zip(raw_records[0], row)) for row in raw_records[1:]]

    try:
        dataset_type, data_type, records_by_sample_id = _parse_raw_qc_records(json_records)
    except ValueError as e:
        return create_json_response({'errors': [str(e)]}, status=400, reason=str(e))

    info_message = 'Parsed {} {} samples'.format(
        len(json_records), 'SV' if dataset_type == Sample.DATASET_TYPE_SV_CALLS else data_type)
    logger.info(info_message)
    info = [info_message]
    warnings = []

    # candidate samples of the matching sample/dataset type, excluding blacklisted projects
    samples = Sample.objects.filter(
        sample_id__in=records_by_sample_id.keys(),
        sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS,
        dataset_type=dataset_type,
    ).exclude(
        individual__family__project__name__in=EXCLUDE_PROJECTS
    ).exclude(individual__family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY)

    # sample_id -> distinct individual db ids that have a sample with that id
    sample_individuals = {
        agg['sample_id']: agg['individuals'] for agg in
        samples.values('sample_id').annotate(individuals=ArrayAgg('individual_id', distinct=True))
    }

    # individual db id -> most recent loaded_date among that individual's samples
    sample_individual_max_loaded_date = {
        agg['individual_id']: agg['max_loaded_date'] for agg in
        samples.values('individual_id').annotate(max_loaded_date=Max('loaded_date'))
    }
    # individual db id -> sample_id of that individual's most recently loaded sample
    individual_latest_sample_id = {
        s.individual_id: s.sample_id for s in samples
        if s.loaded_date == sample_individual_max_loaded_date.get(s.individual_id)
    }

    # attach to each record the individuals whose LATEST sample carries this sample id
    for sample_id, record in records_by_sample_id.items():
        record['individual_ids'] = list({
            individual_id for individual_id in sample_individuals.get(sample_id, [])
            if individual_latest_sample_id[individual_id] == sample_id
        })

    # fallback: match unresolved sample ids directly against Individual ids
    missing_sample_ids = {sample_id for sample_id, record in records_by_sample_id.items() if not record['individual_ids']}
    if missing_sample_ids:
        individuals = Individual.objects.filter(individual_id__in=missing_sample_ids).exclude(
            family__project__name__in=EXCLUDE_PROJECTS).exclude(
            family__project__projectcategory__name=EXCLUDE_PROJECT_CATEGORY).filter(
            sample__sample_type=Sample.SAMPLE_TYPE_WES if data_type == 'exome' else Sample.SAMPLE_TYPE_WGS).distinct()
        individual_db_ids_by_id = defaultdict(list)
        for individual in individuals:
            individual_db_ids_by_id[individual.individual_id].append(individual.id)
        for sample_id, record in records_by_sample_id.items():
            # unlike the legacy endpoint, multiple matches (>= 1) are accepted here
            if not record['individual_ids'] and len(individual_db_ids_by_id[sample_id]) >= 1:
                record['individual_ids'] = individual_db_ids_by_id[sample_id]
                missing_sample_ids.remove(sample_id)

    multi_individual_samples = {
        sample_id: len(record['individual_ids']) for sample_id, record in records_by_sample_id.items()
        if len(record['individual_ids']) > 1}
    if multi_individual_samples:
        logger.info('Found {} multi-individual samples from qc output'.format(len(multi_individual_samples)))
        warnings.append('The following {} samples were added to multiple individuals: {}'.format(
            len(multi_individual_samples), ', '.join(
                sorted(['{} ({})'.format(sample_id, count) for sample_id, count in multi_individual_samples.items()]))))

    if missing_sample_ids:
        logger.info('Missing {} samples from qc output'.format(len(missing_sample_ids)))
        warnings.append('The following {} samples were skipped: {}'.format(
            len(missing_sample_ids), ', '.join(sorted(list(missing_sample_ids)))))

    records_with_individuals = [
        record for sample_id, record in records_by_sample_id.items() if sample_id not in missing_sample_ids
    ]

    # dispatch the actual per-individual updates by dataset type
    if dataset_type == Sample.DATASET_TYPE_SV_CALLS:
        _update_individuals_sv_qc(records_with_individuals, request.user)
    else:
        _update_individuals_variant_qc(records_with_individuals, data_type, warnings, request.user)

    message = 'Found and updated matching seqr individuals for {} samples'.format(len(json_records) - len(missing_sample_ids))
    info.append(message)
    logger.info(message)

    return create_json_response({
        'errors': [],
        'warnings': warnings,
        'info': info,
    })
def load_mapping_file(mapping_file_path):
    """Read and parse the ID-mapping file at mapping_file_path and return the
    resulting id-mapping dict."""
    parsed_rows = parse_file(mapping_file_path, file_utils.file_iter(mapping_file_path))
    return _load_mapping_file(parsed_rows)
def _validate_vcf_metadata(vcf_path):
    """Validate that the JSON metadata file at vcf_path declares sample annotations.

    Raises:
        Exception: if the parsed JSON has no 'sample_annotations' key.
    """
    raw_lines = list(file_utils.file_iter(vcf_path))
    parsed_metadata = json.loads('\n'.join(raw_lines))
    if 'sample_annotations' not in parsed_metadata:
        raise Exception('No samples found in "{}"'.format(vcf_path))
def _validate_vcf_metadata(vcf_path):
    """Validate that the JSON metadata file at vcf_path declares sample annotations.

    Raises:
        Exception: if the parsed JSON has no 'sample_annotations' key.
    """
    content = '\n'.join(line for line in file_utils.file_iter(vcf_path))
    metadata_json = json.loads(content)
    if 'sample_annotations' not in metadata_json:
        raise Exception('No samples found in "{}"'.format(vcf_path))
def load_mapping_file(mapping_file_path):
    """Read and parse the ID-mapping file at mapping_file_path and return the
    resulting id-mapping dict."""
    file_rows = parse_file(
        mapping_file_path,
        file_utils.file_iter(mapping_file_path),
    )
    return _load_mapping_file(file_rows)