def _get_saved_variants_by_family(projects, user):
    """Collect the 'Known gene for phenotype' saved variants of some projects, keyed by family.

    Args:
        projects: projects whose saved variants and individuals are queried.
        user: forwarded unchanged to get_json_for_saved_variants.

    Returns:
        A defaultdict(list) mapping each family guid to the JSON of every
        tagged saved variant that lists that guid under 'familyGuids'.
    """
    known_gene_tag_type = VariantTagType.objects.get(name='Known gene for phenotype')

    tagged_variant_models = SavedVariant.objects.select_related('family', 'project').filter(
        project__in=projects,
        varianttag__variant_tag_type=known_gene_tag_type,
    )

    # Only the two columns needed for the individual_id -> guid lookup are requested.
    individual_guids_by_id = {}
    for individual in Individual.objects.filter(family__project__in=projects).only('guid', 'individual_id'):
        individual_guids_by_id[individual.individual_id] = individual.guid

    variants_json = get_json_for_saved_variants(
        tagged_variant_models,
        add_tags=True,
        add_details=True,
        user=user,
        individual_guids_by_id=individual_guids_by_id,
    )

    variants_by_family_guid = defaultdict(list)
    for variant_json in variants_json:
        for family_guid in variant_json['familyGuids']:
            variants_by_family_guid[family_guid].append(variant_json)
    return variants_by_family_guid
def _get_saved_variants(variants): if not variants: return {} variant_q = Q() for variant in variants: variant_q |= Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt'], family__guid__in=variant['familyGuids']) saved_variants = SavedVariant.objects.filter(variant_q) variants_by_id = { '{}-{}-{}'.format(var['xpos'], var['ref'], var['alt']): var for var in variants } saved_variants_json = get_json_for_saved_variants(saved_variants, add_tags=True) saved_variants_by_guid = {} for saved_variant in saved_variants_json: family_guids = saved_variant['familyGuids'] saved_variant.update(variants_by_id['{}-{}-{}'.format( saved_variant['xpos'], saved_variant['ref'], saved_variant['alt'])]) # For saved variants only use family it was saved for, not all families in search saved_variant['familyGuids'] = family_guids saved_variants_by_guid[saved_variant['variantGuid']] = saved_variant return saved_variants_by_guid
def _get_json_for_variant_tag_types(project, user, individuals_by_guid):
    """Build the tag-type section of a project's JSON, including discovery-tagged variants.

    Args:
        project: project whose tags, notes and saved variants are queried.
        user: forwarded to get_json_for_saved_variants.
        individuals_by_guid: dict mapping individual guid to an individual
            JSON dict containing an 'individualId' key.

    Returns:
        Dict with 'variantTagTypes' (sorted by 'order'),
        'variantFunctionalTagTypes' and 'discoveryTags'.
    """
    individual_guids_by_id = {
        individual['individualId']: individual_guid
        for individual_guid, individual in individuals_by_guid.items()
    }

    tag_counts_by_type_and_family = VariantTag.objects.filter(saved_variant__project=project).values(
        'saved_variant__family__guid', 'variant_tag_type__name').annotate(count=Count('*'))
    note_counts_by_family = VariantNote.objects.filter(saved_variant__project=project).values(
        'saved_variant__family__guid').annotate(count=Count('*'))
    project_variant_tags = get_project_variant_tag_types(
        project,
        tag_counts_by_type_and_family=tag_counts_by_type_and_family,
        note_counts_by_family=note_counts_by_family,
    )

    discovery_tags = []
    for tag_type in project_variant_tags:
        # Only discovery tag types that are actually in use need their variants loaded.
        if tag_type['category'] == 'CMG Discovery Tags' and tag_type['numTags'] > 0:
            tags = VariantTag.objects.filter(
                saved_variant__project=project,
                variant_tag_type__guid=tag_type['variantTagTypeGuid'],
            ).select_related('saved_variant')
            saved_variants = [tag.saved_variant for tag in tags]
            discovery_tags += get_json_for_saved_variants(
                saved_variants, add_tags=True, add_details=True, project=project, user=user,
                individual_guids_by_id=individual_guids_by_id)

    # The functional tag types come from the shared helper below; the list that
    # used to be rebuilt here from FUNCTIONAL_DATA_CHOICES was never returned.
    return {
        'variantTagTypes': sorted(project_variant_tags, key=lambda variant_tag_type: variant_tag_type['order']),
        'variantFunctionalTagTypes': get_json_for_variant_functional_data_tag_types(),
        'discoveryTags': discovery_tags,
    }
def get_individual_mme_matches(request, submission_guid):
    """Return the stored matchmaker results for a submission, plus its family's saved variants.

    Args:
        request: the incoming request; only ``request.user`` is read here.
        submission_guid: guid of the MatchmakerSubmission to look up.

    Returns:
        Whatever _parse_mme_results returns, given the submission, its
        results, the gene ids taken from the saved variants' transcripts and a
        response seeded with 'savedVariantsByGuid'.
    """
    submission = MatchmakerSubmission.objects.get(guid=submission_guid)
    check_mme_permissions(submission, request.user)

    results = MatchmakerResult.objects.filter(submission=submission)

    family_variant_models = SavedVariant.objects.filter(family=submission.individual.family)
    saved_variants = get_json_for_saved_variants(family_variant_models, add_tags=True, add_details=True)

    # The transcripts dict is keyed by gene id.
    gene_ids = {
        gene_id
        for saved_variant in saved_variants
        for gene_id in saved_variant['transcripts'].keys()
    }

    response_json = {
        'savedVariantsByGuid': {saved_variant['variantGuid']: saved_variant for saved_variant in saved_variants},
    }
    return _parse_mme_results(
        submission, results, request.user, additional_genes=gene_ids, response_json=response_json)
def saved_variant_data(request, project_guid, variant_guid=None):
    """Return the tagged or noted saved variants of a project, optionally narrowed.

    Args:
        request: the incoming request; ``request.user`` is permission-checked
            against the project and the optional comma-separated ``families``
            GET parameter restricts results to those family guids.
        project_guid: guid of the project to load variants for.
        variant_guid: optional guid of a single saved variant.

    Returns:
        A JSON response with 'savedVariantsByGuid' and 'genesById', or an
        empty 404 response when ``variant_guid`` is given but matches nothing.
    """
    project = get_project_and_check_permissions(project_guid, request.user)
    family_guids = request.GET['families'].split(',') if request.GET.get('families') else None

    # Always scope to the permission-checked project: the family guids come
    # straight from the request, so filtering on them alone would expose the
    # saved variants of families in projects the user was never checked against.
    variant_query = SavedVariant.objects.filter(family__project=project)
    if family_guids:
        variant_query = variant_query.filter(family__guid__in=family_guids)

    if variant_guid:
        variant_query = variant_query.filter(guid=variant_guid)
        if not variant_query.exists():
            return create_json_response({}, status=404, reason='Variant {} not found'.format(variant_guid))

    saved_variants = get_json_for_saved_variants(variant_query, add_tags=True, add_details=True)
    # Variants with neither notes nor tags are left out of the response.
    variants = {variant['variantGuid']: variant for variant in saved_variants if variant['notes'] or variant['tags']}
    genes = _saved_variant_genes(variants.values())
    _add_locus_lists([project], variants.values(), genes)

    return create_json_response({
        'savedVariantsByGuid': variants,
        'genesById': genes,
    })
def saved_variants(request, tag):
    """Return every saved variant carrying a global tag type, with its related records.

    Args:
        request: the incoming request; ``request.user`` is forwarded to the
            JSON helpers.
        tag: name of a VariantTagType that has no project.

    Returns:
        A JSON response with 'savedVariantsByGuid', 'genesById',
        'projectsByGuid', 'familiesByGuid', 'individualsByGuid' and
        'locusListsByGuid'.
    """
    tag_type = VariantTagType.objects.get(name=tag, project__isnull=True)
    # family and project are read off every model below; joining them in up
    # front avoids one extra query per variant for each of those lookups.
    saved_variant_models = SavedVariant.objects.select_related('family', 'project').filter(
        varianttag__variant_tag_type=tag_type, family__isnull=False)
    saved_variants_json = get_json_for_saved_variants(
        saved_variant_models, add_tags=True, add_details=True, user=request.user)

    project_models_by_guid = {variant.project.guid: variant.project for variant in saved_variant_models}
    families = {variant.family for variant in saved_variant_models}
    individuals = Individual.objects.filter(family__in=families)

    genes = _saved_variant_genes(saved_variants_json)
    locus_list_guids = _add_locus_lists(project_models_by_guid.values(), saved_variants_json, genes)

    projects_json = get_json_for_projects(
        project_models_by_guid.values(), user=request.user, add_project_category_guids_field=False)
    functional_tag_types = get_json_for_variant_functional_data_tag_types()
    for project_json in projects_json:
        # Every project receives the same locus list guids and functional tag types.
        project_json.update({
            'locusListGuids': locus_list_guids,
            'variantTagTypes': get_project_variant_tag_types(project_models_by_guid[project_json['projectGuid']]),
            'variantFunctionalTagTypes': functional_tag_types,
        })

    families_json = _get_json_for_families(list(families), user=request.user, add_individual_guids_field=True)
    individuals_json = _get_json_for_individuals(individuals, user=request.user)
    locus_lists_by_guid = {
        locus_list['locusListGuid']: locus_list
        for locus_list in get_json_for_locus_lists(LocusList.objects.filter(guid__in=locus_list_guids), request.user)
    }

    return create_json_response({
        'savedVariantsByGuid': {variant['variantGuid']: variant for variant in saved_variants_json},
        'genesById': genes,
        'projectsByGuid': {project['projectGuid']: project for project in projects_json},
        'familiesByGuid': {family['familyGuid']: family for family in families_json},
        'individualsByGuid': {indiv['individualGuid']: indiv for indiv in individuals_json},
        'locusListsByGuid': locus_lists_by_guid,
    })
def _get_saved_discovery_variants_by_family(variant_filter, parse_json=False):
    """Group saved variants that carry a global CMG discovery tag by family guid.

    Args:
        variant_filter: dict of extra keyword filters applied to the
            SavedVariant query.
        parse_json: when true the grouped values are variant JSON dicts
            annotated with 'discovery_tag_guids_by_name'; otherwise they are
            the SavedVariant models themselves.

    Returns:
        A defaultdict(list) keyed by family guid, in 'created_date' order.
    """
    discovery_tag_types = VariantTagType.objects.filter(project__isnull=True, category='CMG Discovery Tags')

    # Expose only the discovery tags of each variant as `discovery_tags`.
    discovery_tags_prefetch = Prefetch(
        'varianttag_set',
        to_attr='discovery_tags',
        queryset=VariantTag.objects.filter(
            variant_tag_type__in=discovery_tag_types).select_related('variant_tag_type'),
    )
    variant_models = SavedVariant.objects.select_related('family')
    variant_models = variant_models.prefetch_related(discovery_tags_prefetch)
    variant_models = variant_models.prefetch_related('variantfunctionaldata_set')
    variant_models = variant_models.filter(varianttag__variant_tag_type__in=discovery_tag_types, **variant_filter)
    variant_models = variant_models.order_by('created_date').distinct()

    if parse_json:
        json_by_guid = {}
        for variant_json in get_json_for_saved_variants(variant_models, add_details=True):
            json_by_guid[variant_json['variantGuid']] = variant_json

    grouped_by_family = defaultdict(list)
    for variant_model in variant_models:
        if parse_json:
            # Only the JSON form is annotated; the model form is returned untouched.
            entry = json_by_guid[variant_model.guid]
            entry['discovery_tag_guids_by_name'] = {
                tag.variant_tag_type.name: tag.guid for tag in variant_model.discovery_tags
            }
        else:
            entry = variant_model
        grouped_by_family[variant_model.family.guid].append(entry)

    return grouped_by_family
def _get_saved_variants(variants, families): if not variants: return {}, {} prefetch_related_objects(families, 'project') hg37_family_guids = { family.guid for family in families if family.project.genome_version == GENOME_VERSION_GRCh37 } variant_q = Q() variants_by_id = {} for variant in variants: variants_by_id[_get_variant_key(**variant)] = variant variant_q |= Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt'], family__guid__in=variant['familyGuids']) if variant[ 'liftedOverGenomeVersion'] == GENOME_VERSION_GRCh37 and hg37_family_guids: variant_hg37_families = [ family_guid for family_guid in variant['familyGuids'] if family_guid in hg37_family_guids ] if variant_hg37_families: lifted_xpos = get_xpos(variant['liftedOverChrom'], variant['liftedOverPos']) variant_q |= Q(xpos_start=lifted_xpos, ref=variant['ref'], alt=variant['alt'], family__guid__in=variant_hg37_families) variants_by_id[_get_variant_key( xpos=lifted_xpos, ref=variant['ref'], alt=variant['alt'], genomeVersion=variant['liftedOverGenomeVersion'] )] = variant saved_variants = SavedVariant.objects.filter(variant_q) saved_variants_json = get_json_for_saved_variants(saved_variants, add_tags=True, add_details=True) saved_variants_by_guid = {} variants_to_saved_variants = {} for saved_variant in saved_variants_json: family_guids = saved_variant['familyGuids'] searched_variant = variants_by_id[_get_variant_key(**saved_variant)] saved_variant.update(searched_variant) # For saved variants only use family it was saved for, not all families in search saved_variant['familyGuids'] = family_guids saved_variants_by_guid[saved_variant['variantGuid']] = saved_variant if searched_variant['variantId'] not in variants_to_saved_variants: variants_to_saved_variants[searched_variant['variantId']] = {} for family_guid in family_guids: variants_to_saved_variants[searched_variant['variantId']][ family_guid] = saved_variant['variantGuid'] return saved_variants_by_guid, variants_to_saved_variants
def _get_json_for_variant_tag_types(project):
    """Build the tag-type section of a project's JSON, with per-family tag and note counts.

    Args:
        project: project whose tags, notes and saved variants are queried.

    Returns:
        Dict with 'variantTagTypes' (the project's and the global tag types
        plus a synthetic 'Has Notes' entry, sorted by 'order'),
        'variantFunctionalTagTypes' and 'discoveryTags'.
    """
    note_counts_by_family = VariantNote.objects.filter(saved_variants__family__project=project)\
        .values('saved_variants__family__guid').annotate(count=Count('*'))
    # Notes are reported as a pseudo tag type so they can be displayed alongside real tags.
    note_tag_type = {
        'variantTagTypeGuid': 'notes',
        'name': 'Has Notes',
        'category': 'Notes',
        'description': '',
        'color': 'grey',
        'order': 100,
        'numTags': sum(count['count'] for count in note_counts_by_family),
        'numTagsPerFamily': {
            count['saved_variants__family__guid']: count['count'] for count in note_counts_by_family
        },
    }

    tag_counts_by_type_and_family = VariantTag.objects.filter(saved_variants__family__project=project)\
        .values('saved_variants__family__guid', 'variant_tag_type__name').annotate(count=Count('*'))
    project_variant_tags = _get_json_for_models(
        VariantTagType.objects.filter(Q(project=project) | Q(project__isnull=True)))

    # Bucket the count rows by tag type name once instead of rescanning every
    # row for every tag type.
    count_rows_by_tag_name = {}
    for count_row in tag_counts_by_type_and_family:
        count_rows_by_tag_name.setdefault(count_row['variant_tag_type__name'], []).append(count_row)

    for tag_type in project_variant_tags:
        current_tag_type_counts = count_rows_by_tag_name.get(tag_type['name'], [])
        tag_type.update({
            'numTags': sum(count['count'] for count in current_tag_type_counts),
            'numTagsPerFamily': {
                count['saved_variants__family__guid']: count['count'] for count in current_tag_type_counts
            },
        })

    project_variant_tags.append(note_tag_type)
    project_variant_tags = sorted(project_variant_tags, key=lambda variant_tag_type: variant_tag_type['order'])

    # Only discovery tag types that are actually in use need their variants loaded.
    discovery_tag_type_guids = [
        tag_type['variantTagTypeGuid'] for tag_type in project_variant_tags
        if tag_type['category'] == 'CMG Discovery Tags' and tag_type['numTags'] > 0
    ]
    discovery_tags = get_json_for_saved_variants(SavedVariant.objects.filter(
        family__project=project,
        varianttag__variant_tag_type__guid__in=discovery_tag_type_guids,
    ), add_details=True)

    # project_variant_tags is already sorted by 'order'. The functional tag
    # types come from the shared helper; the list that used to be rebuilt here
    # from FUNCTIONAL_DATA_CHOICES was never returned.
    return {
        'variantTagTypes': project_variant_tags,
        'variantFunctionalTagTypes': get_json_for_variant_functional_data_tag_types(),
        'discoveryTags': discovery_tags,
    }
def _get_saved_known_gene_variants_by_family(projects):
    """Group the 'Known gene for phenotype' saved variants of some projects by family guid.

    Args:
        projects: projects whose families' saved variants are queried.

    Returns:
        A defaultdict(list) mapping each family guid to the JSON of every
        tagged saved variant that lists that guid under 'familyGuids'.
    """
    known_gene_tag_type = VariantTagType.objects.get(name='Known gene for phenotype')

    tagged_variant_models = SavedVariant.objects.select_related('family').filter(
        family__project__in=projects,
        varianttag__variant_tag_type=known_gene_tag_type,
    )
    variants_json = get_json_for_saved_variants(tagged_variant_models, add_details=True)

    grouped_by_family = defaultdict(list)
    for variant_json in variants_json:
        for family_guid in variant_json['familyGuids']:
            grouped_by_family[family_guid].append(variant_json)
    return grouped_by_family
def _get_saved_variants_by_family(projects, user):
    """Return the 'Known gene for phenotype' saved variants of the given projects per family.

    Args:
        projects: projects whose saved variants and individuals are queried.
        user: forwarded unchanged to get_json_for_saved_variants.

    Returns:
        A defaultdict(list) from family guid to the saved variant JSON dicts
        naming that family in 'familyGuids'.
    """
    known_gene_tag_type = VariantTagType.objects.get(name='Known gene for phenotype')
    variant_models = SavedVariant.objects.select_related('family', 'project').filter(
        project__in=projects,
        varianttag__variant_tag_type=known_gene_tag_type,
    )

    project_individuals = Individual.objects.filter(family__project__in=projects).only('guid', 'individual_id')
    guid_by_individual_id = {}
    for individual in project_individuals:
        guid_by_individual_id[individual.individual_id] = individual.guid

    variants_json = get_json_for_saved_variants(
        variant_models, add_tags=True, add_details=True, user=user,
        individual_guids_by_id=guid_by_individual_id)

    # A variant saved for several families is listed under each of them.
    family_variant_pairs = (
        (family_guid, variant_json)
        for variant_json in variants_json
        for family_guid in variant_json['familyGuids']
    )
    variants_by_family = defaultdict(list)
    for family_guid, variant_json in family_variant_pairs:
        variants_by_family[family_guid].append(variant_json)

    return variants_by_family
def _get_saved_variants(variants): if not variants: return {} variant_q = Q() for variant in variants: variant_q |= Q(xpos_start=variant['xpos'], ref=variant['ref'], alt=variant['alt'], family__guid__in=variant['familyGuids']) saved_variants = SavedVariant.objects.filter(variant_q) variants_by_id = {'{}-{}-{}'.format(var['xpos'], var['ref'], var['alt']): var for var in variants} saved_variants_json = get_json_for_saved_variants(saved_variants, add_tags=True) saved_variants_by_guid = {} for saved_variant in saved_variants_json: family_guids = saved_variant['familyGuids'] saved_variant.update( variants_by_id['{}-{}-{}'.format(saved_variant['xpos'], saved_variant['ref'], saved_variant['alt'])] ) # For saved variants only use family it was saved for, not all families in search saved_variant['familyGuids'] = family_guids saved_variants_by_guid[saved_variant['variantGuid']] = saved_variant return saved_variants_by_guid
def saved_variant_data(request, project_guid, variant_guid=None):
    """Return the tagged or noted saved variants of a project, optionally narrowed.

    Args:
        request: the incoming request; ``request.user`` is permission-checked
            against the project and the optional comma-separated ``families``
            GET parameter restricts results to those family guids.
        project_guid: guid of the project to load variants for.
        variant_guid: optional guid of a single saved variant.

    Returns:
        A JSON response with 'savedVariantsByGuid' and 'genesById', or an
        empty 404 response when ``variant_guid`` is given but matches nothing.
    """
    project = get_project_and_check_permissions(project_guid, request.user)

    families_param = request.GET.get('families')
    variant_query = SavedVariant.objects.filter(project=project)
    if families_param:
        requested_family_guids = families_param.split(',')
        if requested_family_guids:
            variant_query = variant_query.filter(family__guid__in=requested_family_guids)

    if variant_guid:
        variant_query = variant_query.filter(guid=variant_guid)
        if variant_query.count() < 1:
            not_found_reason = 'Variant {} not found'.format(variant_guid)
            return create_json_response({}, status=404, reason=not_found_reason)

    individual_guids_by_id = {}
    for individual in Individual.objects.filter(family__project=project):
        individual_guids_by_id[individual.individual_id] = individual.guid

    saved_variants = get_json_for_saved_variants(
        variant_query,
        add_tags=True,
        add_details=True,
        project=project,
        user=request.user,
        individual_guids_by_id=individual_guids_by_id,
    )

    # Variants with neither notes nor tags are left out of the response.
    variants = {}
    for saved_variant in saved_variants:
        if saved_variant['notes'] or saved_variant['tags']:
            variants[saved_variant['variantGuid']] = saved_variant

    genes = _saved_variant_genes(variants.values())
    _add_locus_lists([project], variants.values(), genes)

    response_json = {
        'savedVariantsByGuid': variants,
        'genesById': genes,
    }
    return create_json_response(response_json)
def handle(self, *args, **options):
    """Move a project to GRCh38 and lift its saved variants over to a new index.

    Reads the 'project' (name or guid) and 'es_index' options, validates the
    index against the project, lifts the saved variants' coordinates from
    hg19 to hg38, refreshes each saved variant from the index and finally
    marks the project as GRCh38. Several steps prompt the operator through
    raw_input before continuing.

    Raises:
        CommandError: when samples, family members or families are missing
            from the callset, or when the operator declines a prompt.
    """
    project_arg = options['project']
    elasticsearch_index = options['es_index']

    project = Project.objects.get(
        Q(name=project_arg) | Q(guid=project_arg))
    logger.info('Updating project genome version for {}'.format(
        project.name))

    # Validate the provided index
    logger.info('Validating es index {}'.format(elasticsearch_index))
    sample_ids, index_metadata = get_elasticsearch_index_samples(
        elasticsearch_index)
    validate_index_metadata(index_metadata,
                            project,
                            elasticsearch_index,
                            genome_version=GENOME_VERSION_GRCh38)
    sample_type = index_metadata['sampleType']
    dataset_path = index_metadata['sourceFilePath']

    matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
        project=project,
        sample_ids=sample_ids,
        sample_type=sample_type,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        elasticsearch_index=elasticsearch_index,
        sample_id_to_individual_id_mapping={},
    )

    # Every sample id in the index must map onto a sample record.
    unmatched_samples = set(sample_ids) - set(
        matched_sample_id_to_sample_record.keys())
    if len(unmatched_samples) > 0:
        raise CommandError(
            'Matches not found for ES sample ids: {}.'.format(
                ', '.join(unmatched_samples)))

    prefetch_related_objects(matched_sample_id_to_sample_record.values(),
                             'individual__family')
    included_families = {
        sample.individual.family
        for sample in matched_sample_id_to_sample_record.values()
    }

    # Individuals of the included families that have an active variant-calls
    # sample which is not one of the matched samples.
    missing_individuals = Individual.objects.filter(
        family__in=included_families,
        sample__is_active=True,
        sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
    ).exclude(sample__in=matched_sample_id_to_sample_record.values()
              ).select_related('family')
    missing_family_individuals = defaultdict(list)
    for individual in missing_individuals:
        missing_family_individuals[individual.family].append(individual)

    if missing_family_individuals:
        raise CommandError(
            'The following families are included in the callset but are missing some family members: {}.'
            .format(', '.join([
                '{} ({})'.format(
                    family.family_id,
                    ', '.join([i.individual_id for i in missing_indivs]))
                for family, missing_indivs in
                missing_family_individuals.items()
            ])))

    # Get and clean up expected saved variants
    saved_variant_models_by_guid = {
        v.guid: v
        for v in SavedVariant.objects.filter(family__project=project)
    }
    # Saved variants with neither tags nor notes are offered for deletion.
    deleted_no_tags = set()
    for guid, variant in saved_variant_models_by_guid.items():
        if not (variant.varianttag_set.count()
                or variant.variantnote_set.count()):
            deleted_no_tags.add(guid)

    if deleted_no_tags:
        if raw_input(
                'Do you want to delete the following {} saved variants with no tags (y/n)?: {} '
                .format(len(deleted_no_tags),
                        ', '.join(deleted_no_tags))) == 'y':
            for guid in deleted_no_tags:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

    # Every family that still has saved variants must be part of the callset.
    expected_families = {
        sv.family
        for sv in saved_variant_models_by_guid.values()
    }
    missing_families = expected_families - included_families
    if missing_families:
        raise CommandError(
            'The following families have saved variants but are missing from the callset: {}.'
            .format(', '.join([f.family_id for f in missing_families])))

    # Lift-over saved variants
    _update_variant_samples(matched_sample_id_to_sample_record,
                            elasticsearch_index, dataset_path)
    saved_variants = get_json_for_saved_variants(
        saved_variant_models_by_guid.values(), add_details=True)
    saved_variants_to_lift = [
        v for v in saved_variants
        if v['genomeVersion'] != GENOME_VERSION_GRCh38
    ]

    num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
    if num_already_lifted:
        if raw_input(
                'Found {} saved variants already on Hg38. Continue with liftover (y/n)? '
                .format(num_already_lifted)) != 'y':
            raise CommandError(
                'Error: found {} saved variants already on Hg38'.format(
                    num_already_lifted))
    logger.info(
        'Lifting over {} variants (skipping {} that are already lifted)'.
        format(len(saved_variants_to_lift), num_already_lifted))

    liftover_to_38 = LiftOver('hg19', 'hg38')
    hg37_to_hg38_xpos = {}
    lift_failed = {}
    for v in saved_variants_to_lift:
        # Each distinct xpos is converted at most once, whether it succeeded or failed.
        if not (hg37_to_hg38_xpos.get(v['xpos'])
                or v['xpos'] in lift_failed):
            hg38_coord = liftover_to_38.convert_coordinate(
                'chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
            if hg38_coord and hg38_coord[0]:
                hg37_to_hg38_xpos[v['xpos']] = get_xpos(
                    hg38_coord[0][0], hg38_coord[0][1])
            else:
                lift_failed[v['xpos']] = v

    if lift_failed:
        if raw_input(
                'Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} '
                .format(
                    len(lift_failed), ', '.join([
                        '{}:{}-{}-{} ({})'.format(
                            v['chrom'], v['pos'], v['ref'], v['alt'],
                            ', '.join(v['familyGuids']))
                        for v in lift_failed.values()
                    ]))) != 'y':
            raise CommandError(
                'Error: unable to lift over {} variants'.format(
                    len(lift_failed)))

    # Group the models by their lifted (xpos, ref, alt); variants whose
    # lift-over failed are left out.
    saved_variants_map = defaultdict(list)
    for v in saved_variants_to_lift:
        if hg37_to_hg38_xpos.get(v['xpos']):
            variant_model = saved_variant_models_by_guid[v['variantGuid']]
            saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'],
                                v['alt'])].append(variant_model)

    es_variants = get_es_variants_for_variant_tuples(
        expected_families, saved_variants_map.keys())

    # Lifted variants that the index did not return.
    missing_variants = set(
        saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt'])
                                      for v in es_variants}
    if missing_variants:
        missing_variant_strings = []
        for xpos, ref, alt in missing_variants:
            var_id = '{}-{}-{}'.format(xpos, ref, alt)
            for v in saved_variants_map[(xpos, ref, alt)]:
                tags = v.varianttag_set.all()
                notes = v.variantnote_set.all()
                # Show the tag names, or the notes when the variant has no tags.
                missing_variant_strings.append(
                    '{var_id} {family_id}: {tags} ({guid})'.format(
                        var_id=var_id,
                        family_id=v.family.family_id,
                        guid=v.guid,
                        tags=', '.join([
                            tag.variant_tag_type.name for tag in tags
                        ]) if tags else 'No Tags; {}'.format('; '.join(
                            [note.note for note in notes]))))
        if raw_input(
                'Unable to find the following {} variants in the index. Continue with update (y/n)?:\n{}\n'
                .format(len(missing_variants),
                        '\n'.join(missing_variant_strings))) != 'y':
            raise CommandError(
                'Error: unable to find {} lifted-over variants'.format(
                    len(missing_variants)))

    logger.info('Successfully lifted over {} variants'.format(
        len(es_variants)))

    # Update saved variants
    missing_family_count = 0
    for var in es_variants:
        saved_variant_models = saved_variants_map[(var['xpos'], var['ref'],
                                                   var['alt'])]
        # Saved variants whose family is absent from the index result.
        missing_saved_variants = [
            v for v in saved_variant_models
            if v.family.guid not in var['familyGuids']
        ]
        if missing_saved_variants:
            variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'],
                                              var['ref'], var['alt'])
            if raw_input(
                ('Variant {} (hg37: {}) not find for expected families {}. Continue with update (y/n)? '
                 .format(
                     variant_id, missing_saved_variants[0].xpos, ', '.join([
                         '{} ({})'.format(v.family.guid, v.guid)
                         for v in missing_saved_variants
                     ])))) == 'y':
                # Re-query the variant for all of the saved variants' families
                # and use that result for the update below.
                var = get_single_es_variant(
                    [v.family for v in saved_variant_models],
                    variant_id,
                    return_all_queried_families=True)
                missing_family_count += len(missing_saved_variants)
            else:
                raise CommandError(
                    'Error: unable to find family data for lifted over variant'
                )
        for saved_variant in saved_variant_models:
            saved_variant.xpos_start = var['xpos']
            saved_variant.saved_variant_json = var
            saved_variant.save()

    logger.info('Successfully updated {} variants'.format(
        len(es_variants)))

    # Update project and sample data
    update_model_from_json(project,
                           {'genome_version': GENOME_VERSION_GRCh38})

    reset_cached_search_results(project)

    logger.info('---Done---')
    logger.info(
        'Succesfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants'
        .format(len(es_variants),
                len(missing_variants) + len(lift_failed),
                missing_family_count))
def saved_variants_page(request, tag):
    """Return every saved variant carrying a global tag type, optionally limited to one gene.

    Args:
        request: the incoming request; the optional ``gene`` GET parameter
            keeps only variants whose transcripts include that gene, and
            ``request.user`` is forwarded to the JSON helpers.
        tag: name of a VariantTagType that has no project.

    Returns:
        A JSON response with 'savedVariantsByGuid', 'genesById',
        'projectsByGuid', 'familiesByGuid', 'individualsByGuid' and
        'locusListsByGuid', or a 400 response asking for a gene when more
        than 10000 variants match and no gene was given.
    """
    gene = request.GET.get('gene')
    tag_type = VariantTagType.objects.get(name=tag, project__isnull=True)

    variant_models = SavedVariant.objects.filter(varianttag__variant_tag_type=tag_type)
    if gene:
        variant_models = variant_models.filter(saved_variant_json__transcripts__has_key=gene)

    exceeds_limit = variant_models.count() > 10000
    if exceeds_limit and not gene:
        return create_json_response({'message': 'Select a gene to filter variants'}, status=400)

    prefetch_related_objects(variant_models, 'family__project')
    saved_variants = get_json_for_saved_variants(variant_models, add_tags=True, add_details=True)

    projects_by_guid = {}
    families = set()
    for variant_model in variant_models:
        variant_project = variant_model.family.project
        projects_by_guid[variant_project.guid] = variant_project
    for variant_model in variant_models:
        families.add(variant_model.family)
    individuals = Individual.objects.filter(family__in=families)

    genes = _saved_variant_genes(saved_variants)
    locus_list_guids = _add_locus_lists(projects_by_guid.values(), saved_variants, genes)

    projects_json = get_json_for_projects(
        projects_by_guid.values(), user=request.user, add_project_category_guids_field=False)
    functional_tag_types = get_json_for_variant_functional_data_tag_types()

    variant_tag_types = VariantTagType.objects.filter(
        Q(project__in=projects_by_guid.values()) | Q(project__isnull=True))
    prefetch_related_objects(variant_tag_types, 'project')
    variant_tags_json = _get_json_for_models(variant_tag_types)

    # Tag types without a project are absent here and so apply to every project.
    owning_project_guid_by_tag_guid = {}
    for tag_type_model in variant_tag_types:
        if tag_type_model.project:
            owning_project_guid_by_tag_guid[tag_type_model.guid] = tag_type_model.project.guid

    def _tag_order(variant_tag_type):
        return variant_tag_type['order']

    for project_json in projects_json:
        project_guid = project_json['projectGuid']
        tags_for_project = []
        for tag_json in variant_tags_json:
            owner_guid = owning_project_guid_by_tag_guid.get(tag_json['variantTagTypeGuid'], project_guid)
            if owner_guid == project_guid:
                tags_for_project.append(tag_json)
        project_json.update({
            'locusListGuids': locus_list_guids,
            'variantTagTypes': sorted(tags_for_project, key=_tag_order),
            'variantFunctionalTagTypes': functional_tag_types,
        })

    families_json = _get_json_for_families(list(families), user=request.user, add_individual_guids_field=True)
    individuals_json = _get_json_for_individuals(individuals, user=request.user)

    locus_list_models = LocusList.objects.filter(guid__in=locus_list_guids)
    locus_lists_by_guid = {}
    for locus_list in get_json_for_locus_lists(locus_list_models, request.user):
        locus_lists_by_guid[locus_list['locusListGuid']] = locus_list

    response_json = {
        'savedVariantsByGuid': {variant['variantGuid']: variant for variant in saved_variants},
        'genesById': genes,
        'projectsByGuid': {project['projectGuid']: project for project in projects_json},
        'familiesByGuid': {family['familyGuid']: family for family in families_json},
        'individualsByGuid': {indiv['individualGuid']: indiv for indiv in individuals_json},
        'locusListsByGuid': locus_lists_by_guid,
    }
    return create_json_response(response_json)
def handle(self, *args, **options):
    """Move a project to GRCh38 and lift its saved variants over to a new index.

    Reads the 'project' (name or guid) and 'es_index' options, validates the
    index against the project, lifts the saved variants' coordinates from
    hg19 to hg38, refreshes each saved variant from the index and finally
    updates the project, its samples and its VCF file records. Several steps
    prompt the operator through raw_input before continuing.

    Raises:
        Exception: when samples, family members or families are missing from
            the callset, when a coordinate cannot be lifted over, when a
            lifted variant lacks data for an expected family, or when the
            operator declines a prompt.
    """
    project_arg = options['project']
    elasticsearch_index = options['es_index']

    project = Project.objects.get(Q(name=project_arg) | Q(guid=project_arg))
    logger.info('Updating project genome version for {}'.format(project.name))

    # Validate the provided index
    logger.info('Validating es index {}'.format(elasticsearch_index))
    sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index)
    validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38)
    sample_type = index_metadata['sampleType']
    dataset_path = index_metadata['sourceFilePath']

    matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
        project=project,
        sample_ids=sample_ids,
        sample_type=sample_type,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        elasticsearch_index=elasticsearch_index,
        sample_id_to_individual_id_mapping={},
    )

    # Every sample id in the index must map onto a sample record.
    unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
    if len(unmatched_samples) > 0:
        raise Exception('Matches not found for ES sample ids: {}.'.format(', '.join(unmatched_samples)))

    included_family_individuals = defaultdict(set)
    individual_guids_by_id = {}
    for sample in matched_sample_id_to_sample_record.values():
        included_family_individuals[sample.individual.family].add(sample.individual.individual_id)
        individual_guids_by_id[sample.individual.individual_id] = sample.individual.guid

    # A family in the callset must include all of its members that have a
    # loaded variant-calls sample.
    missing_family_individuals = []
    for family, individual_ids in included_family_individuals.items():
        missing_indivs = family.individual_set.filter(
            sample__sample_status=Sample.SAMPLE_STATUS_LOADED,
            sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS
        ).exclude(individual_id__in=individual_ids)
        if missing_indivs:
            missing_family_individuals.append(
                '{} ({})'.format(family.family_id, ', '.join([i.individual_id for i in missing_indivs]))
            )
    if missing_family_individuals:
        raise Exception(
            'The following families are included in the callset but are missing some family members: {}.'.format(
                ', '.join(missing_family_individuals)
            ))

    # Get and clean up expected saved variants
    saved_variant_models_by_guid = {v.guid: v for v in SavedVariant.objects.filter(project=project)}
    deleted_no_family = set()
    deleted_no_tags = set()
    for guid, variant in saved_variant_models_by_guid.items():
        if not variant.family:
            deleted_no_family.add(guid)
        elif not (variant.varianttag_set.count() or variant.variantnote_set.count()):
            deleted_no_tags.add(guid)

    if deleted_no_family:
        if raw_input('Do you want to delete the following {} saved variants with no family (y/n)?: {} '.format(
                len(deleted_no_family), ', '.join(deleted_no_family))) == 'y':
            for guid in deleted_no_family:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_family)))

    if deleted_no_tags:
        if raw_input('Do you want to delete the following {} saved variants with no tags (y/n)?: {} '.format(
                len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y':
            for guid in deleted_no_tags:
                saved_variant_models_by_guid.pop(guid).delete()
            logger.info('Deleted {} variants'.format(len(deleted_no_tags)))

    # NOTE(review): if the operator keeps variants that have no family, None
    # ends up in expected_families and the message below would fail on
    # `f.family_id` - confirm the intended handling.
    expected_families = {sv.family for sv in saved_variant_models_by_guid.values()}
    missing_families = expected_families - set(included_family_individuals.keys())
    if missing_families:
        raise Exception(
            'The following families have saved variants but are missing from the callset: {}.'.format(
                ', '.join([f.family_id for f in missing_families])
            ))

    # Lift-over saved variants
    saved_variants = get_json_for_saved_variants(
        saved_variant_models_by_guid.values(), add_details=True, project=project,
        individual_guids_by_id=individual_guids_by_id)
    saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38]

    num_already_lifted = len(saved_variants) - len(saved_variants_to_lift)
    if num_already_lifted:
        if raw_input('Found {} saved variants already on Hg38. Continue with liftover (y/n)?'.format(
                num_already_lifted)) != 'y':
            raise Exception('Error: found {} saved variants already on Hg38'.format(num_already_lifted))
    logger.info('Lifting over {} variants (skipping {} that are already lifted)'.format(
        len(saved_variants_to_lift), num_already_lifted))

    liftover_to_38 = LiftOver('hg19', 'hg38')
    hg37_to_hg38_xpos = {}
    lift_failed = set()
    for v in saved_variants_to_lift:
        # Each distinct xpos is converted at most once, whether it succeeded or failed.
        if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed):
            hg38_coord = liftover_to_38.convert_coordinate('chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos']))
            if hg38_coord and hg38_coord[0]:
                hg37_to_hg38_xpos[v['xpos']] = get_xpos(hg38_coord[0][0], hg38_coord[0][1])
            else:
                lift_failed.add(v['xpos'])

    if lift_failed:
        # str.join only accepts strings; stringify the xpos values so a
        # TypeError cannot mask the real error.
        raise Exception(
            'Unable to lift over the following {} coordinates: {}'.format(
                len(lift_failed), ', '.join(str(xpos) for xpos in lift_failed)))

    # Group the models by their lifted (xpos, ref, alt).
    saved_variants_map = defaultdict(list)
    for v in saved_variants_to_lift:
        variant_model = saved_variant_models_by_guid[v['variantGuid']]
        saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model)

    es_variants = get_es_variants_for_variant_tuples(expected_families, saved_variants_map.keys())

    # Lifted variants that the index did not return.
    missing_variants = set(saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants}
    if missing_variants:
        missing_variant_strings = ['{}-{}-{} ({})'.format(
            xpos, ref, alt,
            ', '.join(['{}: {}'.format(v.family.family_id, v.guid) for v in saved_variants_map[(xpos, ref, alt)]]))
            for xpos, ref, alt in missing_variants]
        if raw_input('Unable to find the following {} variants in the index. Continue with update (y/n)?: {} '.format(
                len(missing_variants), ', '.join(missing_variant_strings))) != 'y':
            raise Exception('Error: unable to find {} lifted-over variants'.format(len(missing_variants)))

    logger.info('Successfully lifted over {} variants'.format(len(es_variants)))

    # Update saved variants
    for var in es_variants:
        saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])]
        # The index result must cover every family the variant was saved for.
        missing_families = [v.family.guid for v in saved_variant_models if v.family.guid not in var['familyGuids']]
        if missing_families:
            raise Exception('Error with variant {}:{}-{}-{} not find for expected families {}; found in families {}'.format(
                var['chrom'], var['pos'], var['ref'], var['alt'], ', '.join(missing_families), ', '.join(var['familyGuids'])
            ))
        for saved_variant in saved_variant_models:
            saved_variant.xpos_start = var['xpos']
            saved_variant.saved_variant_json = json.dumps(var)
            saved_variant.save()

    logger.info('Successfully updated {} variants'.format(len(es_variants)))

    # Update project and sample data
    update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38, 'has_new_search': True})
    _update_samples(
        matched_sample_id_to_sample_record, elasticsearch_index=elasticsearch_index, dataset_path=dataset_path
    )
    update_xbrowse_vcfffiles(
        project, sample_type, elasticsearch_index, dataset_path, matched_sample_id_to_sample_record
    )

    reset_cached_search_results(project)

    logger.info('---Done---')
    logger.info('Succesfully lifted over {} variants. Skipped {} failed variants.'.format(
        len(es_variants), len(missing_variants)))