Ejemplo n.º 1
0
    def update_annotator_variants_table(self):
        """Refresh the population frequencies stored on the annotator's variants.

        Iterates over every document in the annotator datastore's ``variants``
        collection, looks its (xpos, ref, alt) up in the population frequency
        store, and sets ``annotation.freqs.<slug>`` for each slug listed in
        ``annotator_settings.reference_populations``. A slug that is absent
        from the lookup result is written as 0. Documents whose frequencies
        do not sum to more than 0 are not updated.
        """
        frequency_store = mall.get_annotator().get_population_frequency_store()
        slugs = [spec["slug"] for spec in annotator_settings.reference_populations]
        annotator_store = mall.get_annotator().get_annotator_datastore()

        for processed, doc in enumerate(annotator_store.variants.find(), 1):
            # progress report every 10,000 documents
            if processed % 10000 == 0:
                print("%s: %s processed" % (datetime.datetime.now(), processed))

            looked_up = frequency_store.get_frequencies(doc["xpos"], doc["ref"], doc["alt"])
            new_fields = {}
            for slug in slugs:
                new_fields["annotation.freqs." + slug] = looked_up.get(slug, 0)

            if not sum(new_fields.values()) > 0:
                # nothing but zeros: leave the stored document alone
                continue

            annotator_store.variants.update(
                {"xpos": doc["xpos"], "ref": doc["ref"], "alt": doc["alt"]},
                {"$set": new_fields},
                upsert=False,
            )
Ejemplo n.º 2
0
    def update_annotator_variants_table(self):
        """Copy reference population frequencies onto the annotator's variant documents.

        For each document returned by ``annotator_store.variants.find()`` the
        frequencies of its (xpos, ref, alt) are fetched from the population
        frequency store. One ``annotation.freqs.<slug>`` value is prepared per
        slug in ``annotator_settings.reference_populations`` (0 when the slug
        is not in the fetched result), and the document is updated only when
        those values sum to more than 0.
        """
        freq_store = mall.get_annotator().get_population_frequency_store()
        slugs = [
            spec['slug'] for spec in annotator_settings.reference_populations
        ]
        annotator_store = mall.get_annotator().get_annotator_datastore()

        num_seen = 0
        for record in annotator_store.variants.find():
            num_seen += 1
            if num_seen % 10000 == 0:
                print("%s: %s processed" % (datetime.datetime.now(), num_seen))

            xpos, ref, alt = record['xpos'], record['ref'], record['alt']
            found = freq_store.get_frequencies(xpos, ref, alt)
            to_set = dict(
                ('annotation.freqs.' + slug, found.get(slug, 0))
                for slug in slugs)

            # only write when at least one of the frequencies is > 0
            if sum(to_set.values()) > 0:
                selector = {'xpos': xpos, 'ref': ref, 'alt': alt}
                annotator_store.variants.update(
                    selector, {'$set': to_set}, upsert=False)
Ejemplo n.º 3
0
def load_project_variants_from_vcf(project_id, vcf_files):
    """
    Add the given VCF files to the annotator, then load their families.

    Every path in vcf_files is handed to the annotator: as a pre-annotated
    VCF when its header has a "CSQ" INFO entry, as a plain VCF otherwise.
    After that, for each VCF reported by project.families_by_vcf() that is
    also in vcf_files, the families are loaded in slices of
    settings.FAMILY_LOAD_BATCH_SIZE with mark_as_loaded=True. VCFs of the
    project that are not in vcf_files are skipped.
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print("Loading project %s" % project_id)
    started_fmt = "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id + " - db.variants cache"
    print(date.strftime(datetime.now(), started_fmt))
    project = Project.objects.get(project_id=project_id)

    for path in vcf_files:
        reader = vcf.VCFReader(filename=path)
        is_preannotated = "CSQ" in reader.infos
        annotator = mall.get_annotator()
        if is_preannotated:
            annotator.add_preannotated_vcf_file(path)
        else:
            annotator.add_vcf_file_to_annotator(path)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    finished_fmt = "%m/%d/%Y %H:%M:%S  -- finished loading project: " + project_id
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s"
                  % {"vcf_file": vcf_file, "vcf_files": vcf_files})
            continue

        print("Loading families for VCF file: " + vcf_file)
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(families), batch_size):
            load_variants_for_family_list(
                project, families[offset:offset + batch_size], vcf_file, mark_as_loaded=True)
            # NOTE: this message is emitted after every batch, not once at the end
            print(date.strftime(datetime.now(), finished_fmt))
Ejemplo n.º 4
0
def load_project_variants_from_vcf(project_id, vcf_files, mark_as_loaded=True, start_from_chrom=None, end_with_chrom=None):
    """
    Load the families of a project whose data comes from the given VCF files.

    Each existing file in vcf_files is first added to the annotator as a
    pre-annotated VCF. Then, for every VCF reported by
    project.families_by_vcf() that is also in vcf_files, the families are
    loaded in slices of settings.FAMILY_LOAD_BATCH_SIZE.

    Args:
       project_id: the project id as a string
       vcf_files: a list of one or more vcf file paths
       mark_as_loaded: forwarded to load_variants_for_family_list
       start_from_chrom: forwarded to the annotator and to load_variants_for_family_list
       end_with_chrom: forwarded to the annotator and to load_variants_for_family_list

    Raises:
       ValueError: if an existing VCF file has no "CSQ" INFO entry
    """
    project = Project.objects.get(project_id=project_id)

    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            # paths that do not exist are skipped in this annotation step only
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)

        # vcf_file comes from vcf_files, so no membership check is needed here
        mall.get_annotator().add_preannotated_vcf_file(vcf_file, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s"
                  % {"vcf_file": vcf_file, "vcf_files": vcf_files})
            continue

        print("Loading families for VCF file: " + vcf_file)
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for i in xrange(0, len(families), batch_size):
            load_variants_for_family_list(project, families[i:i + batch_size], vcf_file, mark_as_loaded=mark_as_loaded, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)
Ejemplo n.º 5
0
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already

    Every VCF file of the project is added to the annotator as a
    pre-annotated VCF. Families whose status in the variant store is not
    'loaded' are then loaded per VCF in slices of
    settings.FAMILY_LOAD_BATCH_SIZE, and finally load_cohorts is called.

    Args:
       project_id: the project id as a string
       force_annotations: passed to the annotator as force=
       ignore_csq_in_vcf: when true, a VCF without a "CSQ" INFO entry is accepted

    Raises:
       ValueError: if a VCF has no "CSQ" INFO entry and ignore_csq_in_vcf is false
    """
    print("Loading project %s" % project_id)
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            # The message must name the file being checked; the loop variable
            # is vcf_obj (a previous reference to `vcf_file` was undefined here).
            raise ValueError("VEP annotations not found in VCF: " + vcf_obj.path())

        mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        # keep only the families that have not finished loading yet
        families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(families[i:i+settings.FAMILY_LOAD_BATCH_SIZE])) ))
            load_variants_for_family_list(project, families[i:i+settings.FAMILY_LOAD_BATCH_SIZE], vcf_file)

    # now load cohorts
    load_cohorts(project_id)
Ejemplo n.º 6
0
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Add a project's VCF files to the annotator, then load its families and cohorts.

    VCF files are processed in order of their path; paths that are not
    existing files are skipped. A VCF without a "CSQ" INFO entry raises
    ValueError unless ignore_csq_in_vcf is set. Unless force_load_variants is
    set, families whose status in the variant store is 'loaded' are left out.
    Families are loaded in slices of settings.FAMILY_LOAD_BATCH_SIZE, and
    load_cohorts(project_id) runs last.
    """
    print("Loading project %s" % project_id)
    started_fmt = "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id + " - db.variants cache"
    print(date.strftime(datetime.now(), started_fmt))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        reader = vcf.VCFReader(filename=vcf_path)
        if not (ignore_csq_in_vcf or "CSQ" in reader.infos):
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(
            vcf_path,
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom,
        )

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # drop the families that have already finished loading
            pending = []
            for family in families:
                status = get_mall(project).variant_store.get_family_status(project_id, family.family_id)
                if status != 'loaded':
                    pending.append(family)
            families = pending

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(families), batch_size):
            load_variants_for_family_list(
                project,
                families[offset:offset + batch_size],
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom,
            )

    # now load cohorts
    load_cohorts(project_id)
Ejemplo n.º 7
0
def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Return a variant for every (xpos, ref, alt, family_id) tuple.

    The tuples are grouped by family_id and each group is fetched with a
    single ``get_multiple_variants`` call. When the datastore yields a falsy
    entry for a tuple, a new ``Variant`` is built, annotated with the
    project's reference population slugs, and given the ``created_variant``
    extra. Every returned variant carries ``family_id`` and ``project_id``
    extras. Results are grouped family by family rather than kept in input
    order.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    keys_by_family = {}
    for xpos, ref, alt, family_id in variant_tuples:
        keys_by_family.setdefault(family_id, []).append((xpos, ref, alt))

    results = []
    for family_id, family_keys in keys_by_family.items():
        fetched = datastore.get_multiple_variants(
            project.project_id, family_id, family_keys, user=user)
        for (xpos, ref, alt), found in zip(family_keys, fetched):
            if found:
                variant = found
            else:
                # not in the datastore: build and annotate a stand-in
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)

            variant.set_extra('family_id', family_id)
            variant.set_extra('project_id', project.project_id)
            results.append(variant)

    return results
Ejemplo n.º 8
0
def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Look up the variants named by (xpos, ref, alt, family_id) tuples.

    One ``get_multiple_variants`` query is issued per distinct family_id.
    A tuple whose query result is falsy is replaced by a freshly created
    ``Variant`` that is annotated via ``get_annotator()`` and marked with
    ``created_variant=True``. All variants get ``family_id`` and
    ``project_id`` extras before being returned as one flat list.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    # family_id -> list of (xpos, ref, alt)
    grouped = {}
    for xpos, ref, alt, family_id in variant_tuples:
        try:
            bucket = grouped[family_id]
        except KeyError:
            bucket = grouped[family_id] = []
        bucket.append((xpos, ref, alt))

    collected = []
    for family_id, coords in grouped.items():
        from_store = datastore.get_multiple_variants(
            project.project_id, family_id, coords, user=user)
        for coord, variant in zip(coords, from_store):
            if not variant:
                xpos, ref, alt = coord
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)

            for extra_key, extra_value in (('family_id', family_id),
                                           ('project_id', project.project_id)):
                variant.set_extra(extra_key, extra_value)
            collected.append(variant)

    return collected
Ejemplo n.º 9
0
def load_project_variants(project_id,
                          force_load_annotations=False,
                          force_load_variants=False,
                          ignore_csq_in_vcf=False,
                          start_from_chrom=None,
                          end_with_chrom=None):
    """
    Register a project's VCFs with the annotator and load its families and cohorts.

    VCFs are visited sorted by path and skipped when the path is not a file.
    A missing "CSQ" INFO entry raises ValueError unless ignore_csq_in_vcf is
    true. Families already reported as 'loaded' by the variant store are
    left out unless force_load_variants is true. Each batch of
    settings.FAMILY_LOAD_BATCH_SIZE families is announced on stdout before
    it is loaded; load_cohorts(project_id) is called at the end.
    """
    stamp_prefix = "%m/%d/%Y %H:%M:%S  -- loading project: "

    def _already_loaded(family):
        # True when the variant store reports this family as fully loaded
        store = get_mall(project.project_id).variant_store
        return store.get_family_status(project_id, family.family_id) == 'loaded'

    print("Loading project %s" % project_id)
    print(date.strftime(datetime.now(),
                        stamp_prefix + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        path = vcf_obj.path()
        if not os.path.isfile(path):
            print("Skipping " + path)
            continue

        header = vcf.VCFReader(filename=path)
        if not (ignore_csq_in_vcf or "CSQ" in header.infos):
            raise ValueError("VEP annotations not found in VCF: " + path)

        annotator_kwargs = dict(
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)
        mall.get_annotator().add_preannotated_vcf_file(path, **annotator_kwargs)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [f for f in families if not _already_loaded(f)]

        step = settings.FAMILY_LOAD_BATCH_SIZE
        for start in xrange(0, len(families), step):
            batch = families[start:start + step]
            batch_note = " - families batch %d - %d families" % (start, len(batch))
            print(date.strftime(datetime.now(),
                                stamp_prefix + project_id + batch_note))
            load_variants_for_family_list(
                project,
                batch,
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
Ejemplo n.º 10
0
def load_project_variants_from_vcf(project_id,
                                   vcf_files,
                                   mark_as_loaded=True,
                                   start_from_chrom=None,
                                   end_with_chrom=None):
    """
    Load the families of a project whose data comes from the given VCF files.

    Each existing file in vcf_files is added to the annotator as a
    pre-annotated VCF. Then, for every VCF reported by
    project.families_by_vcf() that is also in vcf_files, the families are
    loaded in slices of settings.FAMILY_LOAD_BATCH_SIZE.

    Args:
       project_id: the project id as a string
       vcf_files: a list of one or more vcf file paths
       mark_as_loaded: forwarded to load_variants_for_family_list
       start_from_chrom: forwarded to the annotator and to load_variants_for_family_list
       end_with_chrom: forwarded to the annotator and to load_variants_for_family_list

    Raises:
       ValueError: if an existing VCF file has no "CSQ" INFO entry
    """
    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print(
        date.strftime(
            datetime.now(), "%m/%d/%Y %H:%M:%S  -- loading project: " +
            project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_file in vcf_files:
        if not os.path.isfile(vcf_file):
            # paths that do not exist are skipped in this annotation step only
            print("Skipping " + vcf_file)
            continue
        r = vcf.VCFReader(filename=vcf_file)
        if "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_file)

        # vcf_file comes from vcf_files, so no membership check is needed here
        mall.get_annotator().add_preannotated_vcf_file(
            vcf_file,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file not in vcf_files:
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" % {
                "vcf_file": vcf_file,
                "vcf_files": vcf_files,
            })
            continue

        print("Loading families for VCF file: " + vcf_file)
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for i in xrange(0, len(families), batch_size):
            load_variants_for_family_list(
                project,
                families[i:i + batch_size],
                vcf_file,
                mark_as_loaded=mark_as_loaded,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)
            # NOTE: this message is emitted after every batch, not once at the end
            print(
                date.strftime(
                    datetime.now(),
                    "%m/%d/%Y %H:%M:%S  -- finished loading project: " +
                    project_id))
Ejemplo n.º 11
0
def add_populations_to_variants(variants, population_slug_list):
    """Add population frequencies for the given slugs to the variants.

    Does nothing when population_slug_list is empty or None. Otherwise the
    work is delegated to the annotator's population frequency store. This is
    best-effort: any exception raised by the store is reported on stdout as
    a warning and not propagated.
    """
    if not population_slug_list:
        return

    try:
        frequency_store = mall.get_annotator().get_population_frequency_store()
        frequency_store.add_populations_to_variants(variants, population_slug_list)
    except Exception as e:  # "as" form: valid on Python 2.6+ and on Python 3
        print(
            "WARNING: got unexpected error in add_custom_populations_to_variants: %s"
            % e)
Ejemplo n.º 12
0
    def handle(self, *args, **options):
        """Load the custom annotator (when configured), the reference, and the annotator.

        The custom annotator is only built and loaded when
        ``settings.CUSTOM_ANNOTATOR_SETTINGS`` is not None. ``args`` and
        ``options`` are not used.
        """
        custom_settings = settings.CUSTOM_ANNOTATOR_SETTINGS
        if custom_settings is not None:
            print("Load dbNSFP.. ")

            # note that you could use mall.get_custom_annotator() here too
            CustomAnnotator(custom_settings).load()

        reference = get_reference()
        reference.load()

        annotator = mall.get_annotator()
        annotator.load()
Ejemplo n.º 13
0
    def handle(self, *args, **options):
        """Run the load step of each data source in turn.

        Order: the custom annotator (skipped when
        ``settings.CUSTOM_ANNOTATOR_SETTINGS`` is None), then the object
        returned by ``get_reference()``, then the one returned by
        ``mall.get_annotator()``. Command arguments are ignored.
        """
        has_custom_annotator = settings.CUSTOM_ANNOTATOR_SETTINGS is not None
        if has_custom_annotator:
            print("Load dbNSFP.. ")

            # note that you could use mall.get_custom_annotator() here too
            dbnsfp_annotator = CustomAnnotator(
                settings.CUSTOM_ANNOTATOR_SETTINGS)
            dbnsfp_annotator.load()

        for get_source in (get_reference, mall.get_annotator):
            get_source().load()
Ejemplo n.º 14
0
def get_variants_from_note_tuples(project, note_tuples):
    """Return one variant per note tuple, in input order.

    Each tuple is read positionally as (xpos, ref, alt, family_id). The
    variant is fetched from the project's datastore; when that yields a
    falsy result a new ``Variant`` is created and annotated with the
    project's reference population slugs. ``family_id`` and ``project_id``
    extras are set on every variant.
    """
    results = []
    for note in note_tuples:
        datastore = get_datastore(project.project_id)
        xpos, ref, alt, family_id = note[0], note[1], note[2], note[3]

        found = datastore.get_single_variant(project.project_id, family_id, xpos, ref, alt)
        if found:
            variant = found
        else:
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(variant, project.get_reference_population_slugs())

        variant.set_extra("family_id", family_id)
        variant.set_extra("project_id", project.project_id)
        results.append(variant)
    return results
Ejemplo n.º 15
0
def inheritance_matrix_for_gene(project, gene_id):
    """
    Build the family matrix of a gene for the active families of a project.

    Uses the 'moderate_impact' default variant filter and the 'high_quality'
    default quality filter, both created with the annotator's reference
    population slugs, and returns whatever get_family_matrix_for_gene returns.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact', mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter(
        'high_quality', mall.get_annotator().reference_population_slugs)

    the_mall = get_mall()
    active_xfamilies = []
    for family in project.get_active_families():
        active_xfamilies.append(family.xfamily())

    return get_family_matrix_for_gene(
        the_mall, active_xfamilies, gene_id, variant_filter, quality_filter)
Ejemplo n.º 16
0
def get_variants_from_variant_tuples(project, variant_tuples):
    """Return one variant per tuple, in input order.

    Tuples are read positionally as (xpos, ref, alt, family_id). Each
    variant is fetched with ``get_single_variant``; a falsy result is
    replaced by a new ``Variant`` annotated with the project's reference
    population slugs. ``family_id`` and ``project_id`` extras are set on
    every variant.
    """
    found_variants = []
    for entry in variant_tuples:
        datastore = get_datastore(project.project_id)
        xpos, ref, alt, family_id = entry[0], entry[1], entry[2], entry[3]

        variant = datastore.get_single_variant(
            project.project_id, family_id, xpos, ref, alt)
        if not variant:
            # unknown to the datastore: create it and annotate it on the fly
            variant = Variant(xpos, ref, alt)
            slugs = project.get_reference_population_slugs()
            get_annotator().annotate_variant(variant, slugs)

        variant.set_extra('family_id', family_id)
        variant.set_extra('project_id', project.project_id)
        found_variants.append(variant)
    return found_variants
Ejemplo n.º 17
0
def inheritance_matrix_for_gene(project, gene_id):
    """
    Compute get_family_matrix_for_gene over the project's active families.

    The variant filter is the 'moderate_impact' default and the quality
    filter the 'high_quality' default; each is built from the annotator's
    reference population slugs. The mall is looked up by project id.
    """
    moderate_impact = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs)
    high_quality = get_default_quality_filter(
        'high_quality',
        mall.get_annotator().reference_population_slugs)

    project_mall = get_mall(project.project_id)
    xfamilies = [family.xfamily() for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        project_mall, xfamilies, gene_id, moderate_impact, high_quality)
Ejemplo n.º 18
0
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """Render the gene page of one family group.

    Responds with 404 when the project or the family group does not exist,
    and with a plain 'unauthorized' response when the user may not view the
    project. Otherwise the gene is resolved through the reference, the
    family group's variants in that gene are collected with the
    'all_coding' default variant filter, and the
    ``family_group/family_group_gene.html`` template is rendered.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(
        FamilyGroup, project=project, slug=family_group_slug)
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # gene_id may be given in another form; normalise it via the reference
    resolved_gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(resolved_gene_id)

    reference_slugs = mall.get_annotator().reference_population_slugs
    coding_filter = get_default_variant_filter('all_coding', reference_slugs)
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group, resolved_gene_id, variant_filter=coding_filter)

    context = {
        'project': project,
        'family_group': family_group,
        'family_group_json': json.dumps(family_group.toJSON()),
        'gene_json': json.dumps(gene),
        'gene': gene,
        'variants_by_family_json': json.dumps(variants_by_family),
    }
    return render(request, 'family_group/family_group_gene.html', context)
Ejemplo n.º 19
0
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals of a project with recessive variation in a gene.

    The project's variants in the gene are fetched and filtered with the
    'moderate_impact' default variant filter (built from the annotator's
    reference population slugs), wrapped in a CohortGeneVariation, and
    passed to get_individuals_with_inheritance('recessive', ...).

    Returns a (knockouts, variation) tuple.

    NOTE(review): the quality_filter argument is not used - an empty dict is
    always passed to CohortGeneVariation. Confirm whether that is intended.
    """
    individual_ids = []
    for individual in project.get_individuals():
        individual_ids.append(individual.indiv_id)

    population_slugs = mall.get_annotator().reference_population_slugs
    moderate_impact = get_default_variant_filter('moderate_impact', population_slugs)

    project_datastore = get_project_datastore(project.project_id)
    gene_variants = project_datastore.get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=moderate_impact,
    )
    gene_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, moderate_impact)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        gene_variants,
        individual_ids,
        quality_filter={},
    )
    knockouts = get_individuals_with_inheritance(
        'recessive', variation, individual_ids)
    return knockouts, variation
Ejemplo n.º 20
0
    def handle(self, *args, **options):
        """Load CADD scores from a file, or annotate variants that lack one.

        With ``options['cadd_file']`` set, the file is handed to
        ``load_from_cadd_file`` and nothing else is done. Otherwise the
        variants without ``annotation.cadd_phred`` are taken from the project
        collection (when ``options['project_id']`` is set) or from the
        annotator store, a score is fetched for each of them, and every score
        found is written to the matching document in the annotator store.

        Raises:
            AssertionError: if an update does not match an existing document
                in the annotator store.
        """
        annotator_store = mall.get_annotator().get_annotator_datastore()
        missing_cadd_query = {'annotation.cadd_phred': {'$exists': False}}

        if options['cadd_file']:
            print("Loading " + options['cadd_file'])
            load_from_cadd_file(options['cadd_file'])
            # Loading from a file is the whole job in this mode. There is no
            # variant cursor to iterate, so stop here instead of falling
            # through to the loop below with nothing to loop over.
            print("Done")
            return

        if options['project_id']:
            print("Loading " + options['project_id'])
            project = Project.objects.get(project_id=options['project_id'])
            variant_collection = get_project_datastore(project)._get_project_collection(options['project_id']).find(missing_cadd_query)
        else:
            variant_collection = annotator_store.variants.find(missing_cadd_query)

        for r in tqdm.tqdm(variant_collection, unit=' variants'):
            chrom, pos = genomeloc.get_chr_pos(r['xpos'])
            cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
            if cadd_phred is None:
                continue

            # NOTE(review): the score is always written to the annotator store,
            # also when the variants were read from a project collection -
            # confirm that this is intended.
            result = annotator_store.variants.update(
                {'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']},
                {'$set': {'annotation.cadd_phred': cadd_phred}},
                upsert=False)
            assert result['updatedExisting']

        print("Done")
Ejemplo n.º 21
0
def look_up_vcf_loaded_date(vcf_path):
    """Return the loaded date of a VCF, taken from its annotator record.

    The date is the ``generation_time`` attribute of the record's ``_id``
    value; it is also written to the log at info level.

    Raises:
        ValueError: if the annotator returns no record for vcf_path.
    """
    record = get_annotator().get_vcf_file_from_annotator(vcf_path)
    if record is not None:
        loaded_date = record['_id'].generation_time
        logger.info("%s data-loaded date: %s" % (vcf_path, loaded_date))
        return loaded_date

    raise ValueError("Couldn't find loaded date for %s" % vcf_path)
def look_up_vcf_loaded_date(vcf_path):
    """Return the ``generation_time`` of the ``_id`` of a VCF's annotator record.

    Raises:
        ValueError: if the annotator returns no record for vcf_path.
    """
    annotator = get_annotator()
    record = annotator.get_vcf_file_from_annotator(vcf_path)
    if record is None:
        message = "Couldn't find loaded date for %s" % vcf_path
        raise ValueError(message)

    return record['_id'].generation_time
Ejemplo n.º 23
0
    def handle(self, *args, **options):
        """Force-add every VCF file of the given projects to the annotator.

        ``args`` are project ids; with none given a message is printed and
        nothing is done. For each VCF it is reported whether its header has a
        "CSQ" INFO entry, and the file is added with ``force=True`` either
        way. A timestamped line is printed before each project and once more
        after the last one.
        """
        if not args:
            print("Must provide at least one project_id")
            return

        prefix = "%m/%d/%Y %H:%M:%S  -- loading project: "
        suffix = " - db.variants cache"

        for project_id in args:
            print(date.strftime(datetime.now(), prefix + project_id + suffix))
            project = Project.objects.get(project_id=project_id)

            for vcf_obj in project.get_all_vcf_files():
                reader = vcf.VCFReader(filename=vcf_obj.path())
                if "CSQ" in reader.infos:
                    print("Loading VCF %s with CSQ: %s" % (vcf_obj.path(), reader.infos["CSQ"]))
                else:
                    print("VCF %s isn't annotated (eg. doesn't have a CSQ)" % str(vcf_obj.path()))
                mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=True)

        # project_id still holds the last project handled by the loop above
        print(date.strftime(datetime.now(), prefix + project_id + suffix))
Ejemplo n.º 24
0
    def handle(self, *args, **options):
        """Re-add the VCF files of each listed project to the annotator.

        Each positional argument is a project id. Without arguments a hint is
        printed and the command returns. Every VCF of every project is passed
        to ``add_preannotated_vcf_file`` with ``force=True``, after printing
        whether its header contains a "CSQ" INFO entry.
        """
        def _print_stamp(pid):
            # timestamped progress line for the given project id
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S  -- loading project: " + pid + " - db.variants cache"))

        if len(args) == 0:
            print("Must provide at least one project_id")
            return

        for project_id in args:
            _print_stamp(project_id)
            project = Project.objects.get(project_id=project_id)

            for vcf_obj in project.get_all_vcf_files():
                header = vcf.VCFReader(filename=vcf_obj.path())
                annotated = "CSQ" in header.infos
                if not annotated:
                    print("VCF %s isn't annotated (eg. doesn't have a CSQ)" % str(vcf_obj.path()))
                else:
                    print("Loading VCF %s with CSQ: %s" % (vcf_obj.path(), header.infos["CSQ"]))
                annotator = mall.get_annotator()
                annotator.add_preannotated_vcf_file(vcf_obj.path(), force=True)

        # printed once more after the loop, with the last project id
        _print_stamp(project_id)
Ejemplo n.º 25
0
def get_variants_from_variant_tuples(project, variant_tuples):
    """Resolve (xpos, ref, alt, family_id) tuples into variants.

    For each tuple the project's datastore is asked for the single matching
    variant. If it returns something falsy, a ``Variant`` is created from
    the tuple and annotated through ``get_annotator()`` with the project's
    reference population slugs. The ``family_id`` and ``project_id`` extras
    are set on each variant, and the variants are returned in input order.
    """
    resolved = []
    for item in variant_tuples:
        store = get_datastore(project.project_id)
        xpos, ref, alt, family_id = item[0], item[1], item[2], item[3]

        from_store = store.get_single_variant(project.project_id, family_id, xpos, ref, alt)
        if from_store:
            variant = from_store
        else:
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(variant, project.get_reference_population_slugs())

        variant.set_extra('family_id', family_id)
        variant.set_extra('project_id', project.project_id)
        resolved.append(variant)
    return resolved
Ejemplo n.º 26
0
    def update_pop_freqs_in_family_tables(self):
        """Rewrite the ``db_freqs`` of every variant in every family collection.

        Progress is tracked in a local SQLite file
        (``reference_populations_family_tables.db``) holding one row per
        (project_id, family_id). Rows left over from an earlier run are kept
        (INSERT OR IGNORE). Families are picked in random order among the rows
        not yet marked as started; a row is marked started before its variants
        are updated and finished afterwards.

        For each variant, ``db_freqs.<slug>`` is set for every slug in
        ``annotator_settings.reference_populations_to_load``; slugs missing
        from the frequency store's result are written as 0.
        """
        population_frequency_store = mall.get_annotator().get_population_frequency_store()

        # isolation_level=None selects sqlite3's autocommit mode, so each
        # progress statement takes effect as soon as it is executed.
        db = sqlite3.connect("reference_populations_family_tables.db", isolation_level=None)
        try:
            # Register every project / family in the progress table
            db.execute(
                "CREATE TABLE if not exists all_projects(project_id varchar(200), family_id varchar(200), started bool, finished bool)"
            )
            db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id, family_id)")
            for project in Project.objects.all().order_by("-last_accessed_date"):
                project_id = project.project_id
                datastore = get_datastore(project_id)
                for family_info in datastore._get_family_info(project_id):
                    family_id = family_info["family_id"]
                    db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, ?, 0, 0)", (project_id, family_id))

            population_slugs_to_load = [
                population_spec["slug"] for population_spec in annotator_settings.reference_populations_to_load
            ]

            # Work through the families that have not been started, one at a time
            while True:
                remaining_work = list(
                    db.execute("SELECT project_id, family_id FROM all_projects WHERE started=0 ORDER BY RANDOM()")
                )
                print("%d projects / families remaining" % len(remaining_work))
                if not remaining_work:
                    print("Done with all projects/families")
                    break

                project_id, family_id = remaining_work[0]
                datastore = get_datastore(project_id)
                print("    updating %s / %s" % (project_id, family_id))
                db.execute("UPDATE all_projects SET started=1 WHERE project_id=? AND family_id=?", (project_id, family_id))

                family_collection = datastore._get_family_collection(project_id, family_id)

                for variant_dict in family_collection.find():
                    freqs = population_frequency_store.get_frequencies(
                        variant_dict["xpos"], variant_dict["ref"], variant_dict["alt"]
                    )
                    full_freqs = {
                        "db_freqs." + population_slug: freqs.get(population_slug, 0)
                        for population_slug in population_slugs_to_load
                    }
                    family_collection.update(
                        {"xpos": variant_dict["xpos"], "ref": variant_dict["ref"], "alt": variant_dict["alt"]},
                        {"$set": full_freqs},
                        upsert=False,
                    )

                print("     ---> done updating project_id: %s, family_id: %s" % (project_id, family_id))
                db.execute("UPDATE all_projects SET finished=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
        finally:
            # release the SQLite handle even when a datastore call fails
            db.close()
Ejemplo n.º 27
0
def load_project_variants_from_vcf(project_id, vcf_files):
    """
    Register the given VCF files with the annotator and load their families.

    A file whose header has a "CSQ" INFO entry is added as a pre-annotated
    VCF, any other file as a plain VCF. Families are then loaded, with
    mark_as_loaded=True, for each VCF of project.families_by_vcf() that
    appears in vcf_files, settings.FAMILY_LOAD_BATCH_SIZE families at a time.
    """
    stamp = "%m/%d/%Y %H:%M:%S  -- "

    print("Called load_project_variants_from_vcf on " + str(vcf_files))
    print("Loading project %s" % project_id)
    print(
        date.strftime(
            datetime.now(),
            stamp + "loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_path in vcf_files:
        header = vcf.VCFReader(filename=vcf_path)
        has_csq = "CSQ" in header.infos
        annotator = mall.get_annotator()
        add_to_annotator = (annotator.add_preannotated_vcf_file
                            if has_csq else annotator.add_vcf_file_to_annotator)
        add_to_annotator(vcf_path)

    # batch load families by VCF file
    print("project.families_by_vcf(): " + str(project.families_by_vcf()))
    for vcf_file, families in project.families_by_vcf().items():
        if vcf_file in vcf_files:
            print("Loading families for VCF file: " + vcf_file)
            step = settings.FAMILY_LOAD_BATCH_SIZE
            for first in xrange(0, len(families), step):
                load_variants_for_family_list(
                    project,
                    families[first:first + step],
                    vcf_file,
                    mark_as_loaded=True)
                # NOTE: printed after every batch, not only after the last one
                print(
                    date.strftime(
                        datetime.now(),
                        stamp + "finished loading project: " + project_id))
        else:
            print("Skipping %(vcf_file)s since its not in %(vcf_files)s" %
                  {"vcf_file": vcf_file, "vcf_files": vcf_files})
Ejemplo n.º 28
0
def gene_quicklook(request, project_id, gene_id):
    """
    Render the quick-look page for one gene within one project.

    Responds with a plain "Unauthorized" body when the requesting user may
    not view the project. Otherwise it gathers the gene's variants whose alt
    allele count is at most the computed threshold and whose highest
    annotated frequency is below 0.01, plus the knockout individuals, and
    passes them to 'project/gene_quicklook.html' as JSON strings.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    search_description = " - staring gene search for: %s %s \n" % (gene_id,
                                                                  gene)
    sys.stderr.write(project_id + search_description)

    population_slugs = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('all_coding',
                                                population_slugs)

    # The allele count ceiling grows with the number of individuals that
    # have variant data.
    num_indivs = sum(1 for indiv in project.get_individuals()
                     if indiv.has_variant_data())
    aac_threshold = (.2 * num_indivs) + 5

    rare_variants = []
    gene_variants = project_analysis.get_variants_in_gene(
        project, gene_id, variant_filter=variant_filter)
    for variant in gene_variants:
        allele_count = get_alt_allele_count(variant)
        highest_freq = max(variant.annotation['freqs'].values())
        is_rare = allele_count <= aac_threshold and highest_freq < .01
        if is_rare:
            rare_variants.append(variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    knockouts = []
    for knockout_id in knockout_ids:
        indiv_variants = variation.get_relevant_variants_for_indiv_ids(
            [knockout_id])
        add_extra_info_to_variants_project(get_reference(), project,
                                           indiv_variants)
        variants_json = [v.toJSON() for v in indiv_variants]
        knockouts.append({
            'indiv_id': knockout_id,
            'variants': variants_json,
        })

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    context = {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps(
            [v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps(
            [indiv.get_json_obj() for indiv in project.get_individuals()]),
        'knockouts_json': json.dumps(knockouts),
    }
    return render(request, 'project/gene_quicklook.html', context)
Ejemplo n.º 29
0
    def update_pop_freqs_in_project_tables(self):
        """Rewrite the `db_freqs.<slug>` fields of every variant in every project collection.

        Work is tracked in a local SQLite file
        ("reference_populations_project_tables.db") with one row per project.
        Only projects whose row still has started=0 are processed, so projects
        marked as started by an earlier run are not processed again.
        "myoseq_v11" is queued first; the other projects follow in random
        order.
        """
        import random

        population_frequency_store = mall.get_annotator().get_population_frequency_store()
        population_slugs_to_load = [
            population_spec["slug"] for population_spec in annotator_settings.reference_populations
        ]

        # isolation_level=None puts the connection in autocommit mode, so every
        # statement below is persisted as soon as it runs.
        db = sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)
        try:
            db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
            db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")

            other_project_ids = [p.project_id for p in Project.objects.all() if p.project_id != "myoseq_v11"]
            random.shuffle(other_project_ids)
            project_ids = ["myoseq_v11"] + other_project_ids
            for project_id in project_ids:
                # The unique index makes this a no-op for projects that are already tracked.
                db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)", (project_id,))

            # Go through each project and update the variant records
            while True:
                remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
                print("%d projects remaining" % len(remaining_work))
                if not remaining_work:
                    print("Done with all projects")
                    break

                project_id, = remaining_work[0]
                project_store = get_project_datastore(project_id)

                print("    updating %s " % project_id)
                db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))

                project_collection = project_store._get_project_collection(project_id)
                for variant_dict in project_collection.find():
                    freqs = population_frequency_store.get_frequencies(
                        variant_dict["xpos"], variant_dict["ref"], variant_dict["alt"]
                    )
                    # Populations without a stored frequency are written as 0.
                    full_freqs = {
                        "db_freqs." + population_slug: freqs.get(population_slug, 0)
                        for population_slug in population_slugs_to_load
                    }
                    project_collection.update(
                        {"xpos": variant_dict["xpos"], "ref": variant_dict["ref"], "alt": variant_dict["alt"]},
                        {"$set": full_freqs},
                        upsert=False,
                    )

                print("     ---> done updating project_id: %s" % project_id)
                db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
        finally:
            # Release the SQLite handle even when a project update fails part-way.
            db.close()
Ejemplo n.º 30
0
def load_project_variants(project_id, force_annotations=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    All of the project's VCF files are first added to the annotator. Families
    and then cohorts whose status in the variant store is not 'loaded' are
    loaded per VCF file, in batches of settings.FAMILY_LOAD_BATCH_SIZE.

    Args:
        project_id: id of the Project to load.
        force_annotations: forwarded to the annotator as `force_all`.
    """
    print("Loading project %s" % project_id)
    project = Project.objects.get(project_id=project_id)

    # Named vcf_file_obj so the loop does not rebind a module-level `vcf`
    # name, should the file have one.
    for vcf_file_obj in project.get_all_vcf_files():
        mall.get_annotator().add_vcf_file_to_annotator(vcf_file_obj.path(), force_all=force_annotations)

    batch_size = settings.FAMILY_LOAD_BATCH_SIZE

    def needs_loading(item_id):
        # Cohort status is looked up through get_family_status() as well.
        return get_mall().variant_store.get_family_status(project_id, item_id) != 'loaded'

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        families = [f for f in families if needs_loading(f.family_id)]
        for i in range(0, len(families), batch_size):
            load_variants_for_family_list(project, families[i:i + batch_size], vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        cohorts = [c for c in cohorts if needs_loading(c.cohort_id)]
        for i in range(0, len(cohorts), batch_size):
            load_variants_for_cohort_list(project, cohorts[i:i + batch_size], vcf_file)

    print("Finished loading project %s!" % project_id)
Ejemplo n.º 31
0
    def update_pop_freqs_in_project_tables(self):
        """Set the `db_freqs.<slug>` fields on the variants of each project collection.

        A SQLite file ("reference_populations_project_tables.db") keeps one
        row per project with `started` / `finished` flags. Projects are taken
        from that table while their `started` flag is 0. "myoseq_v11" is
        inserted first and the remaining projects in shuffled order.
        """
        import contextlib
        import random

        population_frequency_store = mall.get_annotator().get_population_frequency_store()
        population_slugs_to_load = [population_spec['slug'] for population_spec in annotator_settings.reference_populations]

        # closing() is used because a sqlite3 connection's own context manager
        # only ends the transaction; it does not close the connection.
        with contextlib.closing(
                sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)) as db:
            db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
            db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")

            other_project_ids = [p.project_id for p in Project.objects.all() if p.project_id != "myoseq_v11"]
            random.shuffle(other_project_ids)
            project_ids = ["myoseq_v11"] + other_project_ids
            for project_id in project_ids:
                db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)", (project_id,))

            # Go through each project and update the variant records
            while True:
                remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
                print("%d projects remaining" % len(remaining_work))
                if not remaining_work:
                    print("Done with all projects")
                    break

                project_id, = remaining_work[0]
                project_store = get_project_datastore(project_id)

                print("    updating %s " % project_id)
                # Flag the project before touching its variants, so the next
                # SELECT above no longer returns it.
                db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))

                project_collection = project_store._get_project_collection(project_id)
                for variant_dict in project_collection.find():
                    variant_key = {
                        'xpos': variant_dict['xpos'],
                        'ref': variant_dict['ref'],
                        'alt': variant_dict['alt'],
                    }
                    freqs = population_frequency_store.get_frequencies(
                        variant_key['xpos'], variant_key['ref'], variant_key['alt'])
                    # Populations without a stored frequency are written as 0.
                    full_freqs = {
                        'db_freqs.' + population_slug: freqs.get(population_slug, 0)
                        for population_slug in population_slugs_to_load
                    }
                    project_collection.update(variant_key, {'$set': full_freqs}, upsert=False)

                print("     ---> done updating project_id: %s" % project_id)
                db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
Ejemplo n.º 32
0
    def update_pop_freqs_in_family_tables(self):
        """Rewrite the `db_freqs.<slug>` fields of every variant in every family collection.

        A SQLite file ("reference_populations_family_tables.db") holds one row
        per (project, family) with `started` / `finished` flags. Rows are
        registered for all projects, then processed one at a time in random
        order while their `started` flag is 0.
        """
        population_frequency_store = mall.get_annotator().get_population_frequency_store()
        population_slugs_to_load = [
            population_spec['slug'] for population_spec in annotator_settings.reference_populations_to_load
        ]

        # isolation_level=None: autocommit, so progress flags are persisted immediately.
        db = sqlite3.connect("reference_populations_family_tables.db", isolation_level=None)
        try:
            db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), family_id varchar(200), started bool, finished bool)")
            db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id, family_id)")

            # Register every family of every project.
            for project in Project.objects.all().order_by('-last_accessed_date'):
                project_id = project.project_id
                datastore = get_datastore(project_id)
                for family_info in datastore._get_family_info(project_id):
                    family_id = family_info['family_id']
                    db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, ?, 0, 0)", (project_id, family_id))

            # Process the not-yet-started families, picking the next one at random.
            while True:
                remaining_work = list(db.execute("SELECT project_id, family_id FROM all_projects WHERE started=0 ORDER BY RANDOM()"))
                print("%d projects / families remaining" % len(remaining_work))
                if not remaining_work:
                    print("Done with all projects/families")
                    break

                project_id, family_id = remaining_work[0]
                datastore = get_datastore(project_id)
                print("    updating %s / %s" % (project_id, family_id))
                db.execute("UPDATE all_projects SET started=1 WHERE project_id=? AND family_id=?", (project_id, family_id))

                family_collection = datastore._get_family_collection(project_id, family_id)

                for variant_dict in family_collection.find():
                    freqs = population_frequency_store.get_frequencies(variant_dict['xpos'], variant_dict['ref'], variant_dict['alt'])
                    # Populations without a stored frequency are written as 0.
                    full_freqs = {'db_freqs.'+population_slug: freqs.get(population_slug, 0) for population_slug in population_slugs_to_load}
                    family_collection.update({'xpos':variant_dict['xpos'], 'ref' :variant_dict['ref'], 'alt': variant_dict['alt']},
                                             {'$set': full_freqs},
                                             upsert=False)

                print("     ---> done updating project_id: %s, family_id: %s" % (project_id, family_id))
                db.execute("UPDATE all_projects SET finished=1 WHERE project_id=? AND family_id=?", (project_id, family_id))
        finally:
            # Release the SQLite handle even when an update fails part-way.
            db.close()
Ejemplo n.º 33
0
    def handle(self, *args, **options):
        """Load CADD scores from a file, or for the un-scored variants of a project or of the annotator store.

        Modes, chosen from `options`:
          * 'cadd_file' set: delegate to load_from_cadd_file() and return.
          * 'project_id' set: iterate the variants of that project's
            collection that have no 'annotation.cadd_phred' field.
          * otherwise: iterate the annotator store's variants that have no
            'annotation.cadd_phred' field.

        For each iterated variant the score is looked up with fetch(); when
        one is found it is written to the matching annotator store record.
        """
        annotator_store = mall.get_annotator().get_annotator_datastore()
        if options['cadd_file']:
            print("Loading " + options['cadd_file'])
            load_from_cadd_file(options['cadd_file'])
            # No variant cursor is selected in this mode, so stop here rather
            # than fall through to the per-variant loop below (which used to
            # raise UnboundLocalError on `variant_collection`).
            print("Done")
            return

        missing_cadd_query = {'annotation.cadd_phred': {'$exists': False}}
        if options['project_id']:
            print("Loading " + options['project_id'])
            project = Project.objects.get(project_id=options['project_id'])
            variant_collection = get_project_datastore(
                project)._get_project_collection(options['project_id']).find(
                    missing_cadd_query)
        else:
            variant_collection = annotator_store.variants.find(
                missing_cadd_query)

        for r in tqdm.tqdm(variant_collection, unit=' variants'):
            chrom, pos = genomeloc.get_chr_pos(r['xpos'])
            cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
            if cadd_phred is not None:
                # The score is always written to the annotator store, also
                # when the variants were read from a project collection.
                result = annotator_store.variants.update(
                    {
                        'xpos': r['xpos'],
                        'ref': r['ref'],
                        'alt': r['alt']
                    }, {'$set': {
                        'annotation.cadd_phred': cadd_phred
                    }},
                    upsert=False)
                assert result['updatedExisting']

        print("Done")
Ejemplo n.º 34
0
def get_knockouts_in_gene(project, gene_id, gene_variants):
    """
    Find the individuals of a project with recessive inheritance in a gene.

    `gene_variants` is narrowed with the default 'moderate_impact' variant
    filter built for the annotator's reference populations. The remaining
    variants are wrapped in a CohortGeneVariation over all individuals of the
    project, with an empty quality filter.

    Returns a tuple of (result of get_individuals_with_inheritance() for
    'recessive', the CohortGeneVariation that was built).
    """
    indiv_id_list = [indiv.indiv_id for indiv in project.get_individuals()]

    annotator = mall.get_annotator()
    variant_filter = get_default_variant_filter(
        'moderate_impact',
        annotator.reference_population_slugs,
    )
    filtered_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, variant_filter)

    reference = get_reference()
    variation = CohortGeneVariation(
        reference,
        gene_id,
        filtered_variants,
        indiv_id_list,
        quality_filter={},
    )
    return get_individuals_with_inheritance('recessive', variation, indiv_id_list), variation
Ejemplo n.º 35
0
def gene_quicklook(request, project_id, gene_id):
    """
    Show a per-project summary page for a gene.

    Users that may not view the project get a plain "Unauthorized" response.
    For everyone else the view collects the gene's variants that pass the
    allele count and frequency cut-offs below, the knockout individuals with
    their variants, and renders 'project/gene_quicklook.html'.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs,
    )
    individuals_with_data = [indiv for indiv in project.get_individuals() if indiv.has_variant_data()]
    aac_threshold = (.2 * len(individuals_with_data)) + 5

    def is_rare(candidate):
        # Both values are computed first, then compared against their limits.
        alt_allele_count = get_alt_allele_count(candidate)
        top_frequency = max(candidate.annotation['freqs'].values())
        return alt_allele_count <= aac_threshold and top_frequency < .01

    rare_variants = [
        variant
        for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)
        if is_rare(variant)
    ]
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    knockouts = []
    for knockout_indiv_id in knockout_ids:
        knockout_variants = variation.get_relevant_variants_for_indiv_ids([knockout_indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, knockout_variants)
        knockouts.append({
            'indiv_id': knockout_indiv_id,
            'variants': [v.toJSON() for v in knockout_variants],
        })

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    template_context = {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([indiv.get_json_obj() for indiv in project.get_individuals()]),
        'knockouts_json': json.dumps(knockouts),
    }
    return render(request, 'project/gene_quicklook.html', template_context)
Ejemplo n.º 36
0
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """
    Render the variants of one gene for every family in a family group.

    Returns a 404 when the project or the family group does not exist, and a
    plain 'unauthorized' response when the user may not view the project.
    Otherwise renders 'family_group/family_group_gene.html' with the gene, the
    family group and the per-family variants (the latter as JSON strings).
    """
    project = get_object_or_404(Project, project_id=project_id)
    # Authorize before looking up the family group, so users without access
    # to the project cannot tell existing slugs (unauthorized) from missing
    # ones (404).
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')
    family_group = get_object_or_404(FamilyGroup, project=project, slug=family_group_slug)

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    varfilter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
    variants_by_family = family_group_analysis.get_variants_in_gene(family_group, gene_id, variant_filter=varfilter)

    return render(request, 'family_group/family_group_gene.html', {
        'project': project,
        'family_group': family_group,
        'family_group_json': json.dumps(family_group.toJSON()),
        'gene_json': json.dumps(gene),
        'gene': gene,
        'variants_by_family_json': json.dumps(variants_by_family),
    })
Ejemplo n.º 37
0
def default_variant_filters_json():
    """Return the default variant filters with each 'variant_filter' replaced by its toJSON() result.

    The entries returned by get_default_variant_filters() are modified in
    place and the same collection is returned.
    """
    population_slugs = mall.get_annotator().reference_population_slugs
    default_filters = get_default_variant_filters(population_slugs)
    for entry in default_filters:
        serialized = entry['variant_filter'].toJSON()
        entry['variant_filter'] = serialized
    return default_filters
Ejemplo n.º 38
0
def default_variant_filters_json():
    """Build the default variant filters and convert each 'variant_filter' value with toJSON().

    Conversion happens in place on the entries that
    get_default_variant_filters() returns.
    """
    annotator = mall.get_annotator()
    filter_specs = get_default_variant_filters(annotator.reference_population_slugs)
    for spec in filter_specs:
        spec['variant_filter'] = spec['variant_filter'].toJSON()
    return filter_specs
Ejemplo n.º 39
0
def preload_vep_vcf_annotations(vcf_file_path):
    """Open a VCF file and pass it to the annotator's preload_vep_annotated_vcf().

    Args:
        vcf_file_path: path of the VCF file to open.
    """
    # The handle is closed as soon as the annotator call returns.
    # NOTE(review): assumes preload_vep_annotated_vcf() reads the file before
    # returning and does not keep the handle — confirm.
    with open(vcf_file_path) as vcf_file:
        mall.get_annotator().preload_vep_annotated_vcf(vcf_file)
Ejemplo n.º 40
0
    def handle(self, *args, **options):
        """Export the filtered variants of selected individuals of a project to a gzipped TSV.

        Expects two positional command line args: the project_id and the path
        of a file with individual ids (the first tab-separated column of each
        line; empty lines and lines starting with "#" are ignored). Exits
        with an error message when the args are missing or when any listed id
        is not an individual of the project.

        One row per (individual, variant) is written to
        'individuals_in_<project_id>.tsv.gz' for the variants returned by
        get_variants() with the filters built below, skipping genotypes with
        no alleles, no alt allele, or a depth below DP_threshold.
        """
        if len(args) != 2:
            sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set(i.indiv_id for i in project.get_individuals())

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            sys.exit(
                ("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                 "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % {
                     'individuals_file': individuals_file,
                     'num_invalid': len(invalid_individual_ids),
                     'total_ids': len(all_individual_ids_in_project),
                     'invalid_individual_ids': invalid_individual_ids,
                     'individuals_of_interest': individuals_of_interest,
                 })

        # Set copy for the membership test in the nested loop below; the list
        # is kept because it is what the messages above print.
        ids_of_interest = set(individuals_of_interest)

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        # create individuals_variants.tsv
        individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
        try:
            writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')
            writer.writerow(header_fields)

            # collect the resources that we'll need here
            # NOTE(review): `annotator` is not used below; the call is kept in
            # case get_annotator() has initialization side effects — confirm
            # and remove if it has none.
            annotator = get_annotator()
            custom_population_store = get_custom_population_store()

            individual_counter = 0
            for family in project.get_families():
                for individual in family.get_individuals():
                    if individual.indiv_id not in ids_of_interest:
                        continue
                    individual_counter += 1
                    print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))
                    for variant in get_variants(get_datastore(project.project_id),
                                                family.xfamily(),
                                                variant_filter=variant_filter,
                                                quality_filter=quality_filter,
                                                indivs_to_consider=[individual.indiv_id]):
                        genotype = variant.get_genotype(individual.indiv_id)
                        if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                            continue

                        custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                        g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                        exac_freq = variant.annotation['freqs']['exac_v3']
                        exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                        merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                        # Sanity checks: every variant that reaches this point
                        # is expected to already satisfy the filters above.
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                        assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                        assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is divided by 100 before being
                            # compared with genotype.ab.
                            assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                        assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                        row = [
                            project_id,
                            family.family_id,
                            individual.indiv_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            merck_wgs_3793_freq,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab,
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                            genotype.filter,
                        ]
                        writer.writerow([str(value) for value in row])
                        individual_variants_f.flush()
        finally:
            # Close the gzip stream even when an assertion or lookup fails, so
            # the rows written so far end up in a valid file.
            individual_variants_f.close()
Ejemplo n.º 41
0
 def handle(self, *args, **options):
     """Call load() on the annotator obtained from mall.get_annotator()."""
     annotator = mall.get_annotator()
     annotator.load()
Ejemplo n.º 42
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        other_projects = [p for p in Project.objects.all()]  #  if p != project
    else:
        other_projects = [
            c.project
            for c in ProjectCollaborator.objects.filter(user=request.user)
        ]  # if c.project != project

    other_projects = filter(
        lambda p: get_project_datastore(p.project_id).
        project_collection_is_loaded(p.project_id), other_projects)

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id)])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        projects_to_search = []
        project_ids = projects_to_search_param.split(",")
        for project_id in project_ids:
            project = get_object_or_404(Project, project_id=project_id)
            if not project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(project)
    else:
        projects_to_search = [project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    for project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())
            if not any([
                    indiv_id
                    for indiv_id, genotype in variant.genotypes.items()
                    if genotype.num_alt > 0
            ]):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if the variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        #sys.stderr.write("gene_id: %s, variant: %s\n" % (gene_id, variant.toJSON()['annotation']['vep_annotation']))
        add_extra_info_to_variants_project(get_reference(), project,
                                           project_variants)
        rare_variants.extend(project_variants)
    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids(
                [indiv_id])
            add_extra_info_to_variants_project(get_reference(), project,
                                               variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            #sys.stderr.write("%s : %s: Retrieved %s knockout variants\n" % (project.project_id, indiv_id, len(variants), ))

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':

            individuals_to_include = [
                individ_id_and_variants["indiv_id"]
                for individ_id_and_variants in individ_ids_and_variants
            ]

            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation[
                        "worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][
                        worst_annotation_idx]
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        if indiv_id in variant.genotypes and variant.genotypes[
                                indiv_id].num_alt > 0:
                            genotype = variant.genotypes[indiv_id]
                            allele_string = ">".join(genotype.alleles)
                            all_genotypes_string += indiv_id + ":" + allele_string + "  "
                            genotypes.append(allele_string + "   (" +
                                             str(genotype.gq) + ")")
                        else:
                            genotypes.append("")

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))

                    rows.append(
                        map(str, [
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            worst_annotation.get("hgvsp", "").replace(
                                "%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(
                                set(
                                    worst_annotation.get("fathmm_pred",
                                                         "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3", ""),
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3_popmax", ""),
                            variant.annotation["freqs"].get("exac_v3", ""),
                            variant.annotation["freqs"].get(
                                "exac_v3_popmax", ""),
                            all_genotypes_string,
                        ] + genotypes))
        elif download_csv == 'rare_variants':
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation[
                    "worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][
                    worst_annotation_idx]
                genotypes = []
                all_genotypes_string = ""
                for indiv_id in individuals_to_include:
                    if indiv_id in variant.genotypes and variant.genotypes[
                            indiv_id].num_alt > 0:
                        genotype = variant.genotypes[indiv_id]
                        allele_string = ">".join(genotype.alleles)
                        all_genotypes_string += indiv_id + ":" + allele_string + "  "
                        genotypes.append(allele_string + "   (" +
                                         str(genotype.gq) + ")")
                    else:
                        genotypes.append("")

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))
                rows.append(
                    map(str, [
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(
                            set(
                                worst_annotation.get("fathmm_pred",
                                                     "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get(
                            "1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        all_genotypes_string,
                    ] + genotypes))

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + list(
            map(lambda i: i + " (from %s)" % indiv_id_to_project_id[i],
                individuals_to_include))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in variants
            ]

        return render(
            request, 'project/gene_quicklook.html', {
                'gene':
                gene,
                'gene_json':
                json.dumps(gene),
                'project':
                project,
                'rare_variants_json':
                json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json':
                json.dumps([
                    i.get_json_obj() for project in projects_to_search
                    for i in project.get_individuals()
                ]),
                'knockouts_json':
                json.dumps(individ_ids_and_variants),
                'other_projects_json':
                other_projects_json,
            })
Ejemplo n.º 43
0
    def handle(self, *args, **options):
        """Export a project's variants, per inheritance mode and family, to a gzipped TSV.

        Takes exactly one positional argument (the project id) and exits with
        an error message otherwise. Output is written to
        ``family_variants_<project_id>.tsv.gz`` (relative path): one row per
        variant holding its annotation and population frequencies, followed by
        genotype columns for up to 10 individuals of the family.

        Raises:
            AssertionError: if a variant or genotype violates the module-level
                frequency / quality thresholds (sanity check on the upstream
                filtering).
        """
        if not args:
            sys.exit("ERROR: please specify project id on the command line")
        if len(args) > 1:
            sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args) )

        project_id = args[0]

        # Fetch the project once, before the output file is created, so an
        # unknown project id fails without leaving a header-only file behind.
        project = Project.objects.get(project_id=project_id)

        # The header reserves this many genotype column groups per row.
        max_genotype_slots = 10

        header_fields = [
            '#inheritance_mode',
            'project_id',
            'family_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            '',
            ]

        genotype_headers = [
            'sample_id',
            'str',
            'num_alt',
            'allele_balance',
            'AD',
            'DP',
            'GQ',
            'PL',
        ]

        for slot in range(0, max_genotype_slots):
            for h in genotype_headers:
                header_fields.append("genotype%d_%s" % (slot, h))

        inheritance_modes = ['dominant', 'homozygous_recessive', 'compound_het', 'de_novo', 'x_linked_recessive']

        # create family_variants.tsv; the context manager closes the gzip
        # stream even when one of the sanity-check assertions below fires.
        with gzip.open('family_variants_%s.tsv.gz' % project_id, 'w') as family_variants_f:
            writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
            writer.writerow(header_fields)
            family_variants_f.flush()

            for inheritance_mode in inheritance_modes:
                # get the variants for this inheritance / project combination
                for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                    for variant in variant_list:
                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']

                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)

                        row = [
                            inheritance_mode,
                            project_id,
                            family.family_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            str(variant.pos),
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation['vep_group'],

                            g1k_freq,
                            g1k_popmax_freq,

                            exac_freq,
                            exac_popmax_freq,
                            '',
                        ]

                        for slot, individual in enumerate(family.get_individuals()):
                            if slot >= max_genotype_slots:
                                break

                            genotype = variant.get_genotype(individual.indiv_id)
                            if genotype is None:
                                # Nothing is appended for this individual, so the
                                # following individuals move into earlier column groups.
                                print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                                continue

                            assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                # AB_threshold is divided by 100 before being compared with genotype.ab
                                assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)

                            genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                            row.extend([
                                individual.indiv_id,
                                genotype_str,
                                genotype.num_alt,
                                genotype.ab,
                                genotype.extras["ad"],
                                genotype.extras["dp"],
                                genotype.gq,
                                genotype.extras["pl"],])

                        writer.writerow(row)
                        # flush per row so partial output is on disk during a long run
                        family_variants_f.flush()
Ejemplo n.º 44
0
def _quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one gene-quicklook CSV row (a list of strings) for `variant`.

    The fixed annotation columns come first, followed by one genotype column
    per entry of `individuals_to_include`, in the same order as that list.
    """
    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

    genotype_columns = []
    all_genotypes_parts = []
    for indiv_id in individuals_to_include:
        if indiv_id not in variant.genotypes:
            # No genotype recorded for this individual at this variant: emit an
            # empty cell so the column still lines up with the header.
            genotype_columns.append("")
            continue
        genotype = variant.genotypes[indiv_id]
        allele_string = ">".join(genotype.alleles)
        all_genotypes_parts.append(indiv_id + ":" + allele_string + "  ")
        if genotype.num_alt > 0:
            genotype_columns.append(allele_string + "   (" + str(genotype.gq) + ")")
        else:
            genotype_columns.append("")

    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
        variant.unique_tuple(), ("", ""))

    freqs = variant.annotation["freqs"]
    columns = [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or "",
        variant.annotation.get("vep_consequence", ""),
        worst_annotation.get("hgvsc", ""),
        # "%3D" is a URL-encoded "="
        worst_annotation.get("hgvsp", "").replace("%3D", "="),
        worst_annotation.get("sift", ""),
        worst_annotation.get("polyphen", ""),
        worst_annotation.get("mutationtaster_pred", ""),
        # "%3B" is a URL-encoded ";" separating predictions; duplicates are collapsed
        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
        measureset_id,
        clinvar_significance,
        freqs.get("1kg_wgs_phase3", ""),
        freqs.get("1kg_wgs_phase3_popmax", ""),
        freqs.get("exac_v3", ""),
        freqs.get("exac_v3_popmax", ""),
        "".join(all_genotypes_parts),
    ]
    return [str(value) for value in columns + genotype_columns]


def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders 'project/gene_quicklook.html' with the project's rare coding
    variants in the gene (max reference frequency below 1% and carried by at
    least one individual) and the knockout individuals. When the request has
    a `download` GET parameter equal to 'knockouts' or 'rare_variants', a CSV
    attachment of the corresponding variants is returned instead; any other
    non-empty value yields a 400 response.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
            })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" %
                     (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        max_af = max(variant.annotation['freqs'].values())
        # skip variants that no individual actually carries
        if not any(genotype.num_alt > 0
                   for _, genotype in variant.genotypes.items()):
            continue
        if max_af < .01:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write(
        "Project-wide gene search retrieved %s rare variants for gene: %s \n" %
        (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if download_csv:
        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            variants_to_export = [
                variant
                for entry in individ_ids_and_variants
                for variant in entry["variants"]
            ]
        elif download_csv == 'rare_variants':
            # carriers of at least one rare variant, in first-seen order
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            variants_to_export = rare_variants
        else:
            # Reject unknown export types explicitly; the value comes straight
            # from the query string, so it is neither echoed back nor used in
            # the response headers.
            return HttpResponse("Invalid 'download' parameter",
                                status=400,
                                content_type='text/plain')

        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv, gene["transcript_name"])

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for variant in variants_to_export:
            writer.writerow(
                _quicklook_csv_row(gene, gene_id, variant,
                                   individuals_to_include))
        return response

    # HTML view: variants must be converted to JSON-serializable dicts first
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    return render(
        request, 'project/gene_quicklook.html', {
            'gene':
            gene,
            'gene_json':
            json.dumps(gene),
            'project':
            project,
            'rare_variants_json':
            json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json':
            json.dumps(
                [i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json':
            json.dumps(individ_ids_and_variants),
        })
Ejemplo n.º 45
0
    def handle(self, *args, **options):
        """Export the filtered variants carried by selected individuals to a gzipped TSV.

        Expects two positional arguments: the project id and the path of a
        text file listing individual ids (first tab-separated column of each
        line; blank lines and lines starting with '#' are ignored). Exits with
        an error message if the argument count is wrong or if any listed id
        does not belong to the project. Output is written to
        ``individuals_in_<project_id>.tsv.gz`` (relative path), one row per
        (individual, variant) pair that passes the filters.

        Raises:
            AssertionError: if a variant or genotype returned by the filtered
                query violates the module-level frequency / quality thresholds.
        """
        if len(args) != 2:
            sys.exit(
                "ERROR: please specify the project_id and file of individual ids as command line args."
            )

        project_id = args[0]
        individuals_file = args[1]

        # init objects
        project = Project.objects.get(project_id=project_id)
        all_individual_ids_in_project = set(
            i.indiv_id for i in project.get_individuals())

        individuals_of_interest = []
        invalid_individual_ids = []
        with open(individuals_file) as f:
            for line in f:
                line = line.strip('\n')
                if not line or line.startswith("#"):
                    continue
                individual_id = line.split("\t")[0]
                if individual_id in all_individual_ids_in_project:
                    individuals_of_interest.append(individual_id)
                else:
                    invalid_individual_ids.append(individual_id)

        print("Processing %s: %d individuals " %
              (project_id, len(individuals_of_interest)))
        if invalid_individual_ids:
            sys.exit((
                "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
            ) % {
                "individuals_file": individuals_file,
                "num_invalid": len(invalid_individual_ids),
                "total_ids": len(all_individual_ids_in_project),
                "invalid_individual_ids": invalid_individual_ids,
                "individuals_of_interest": individuals_of_interest,
            })

        # set copy for O(1) membership tests in the per-individual loop below
        ids_of_interest = set(individuals_of_interest)

        # filter
        variant_filter = get_default_variant_filter('moderate_impact')
        variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
        variant_filter.ref_freqs.append(
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
        variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
        variant_filter.ref_freqs.append(
            ('exac_v3_popmax', exac_popmax_threshold))
        variant_filter.ref_freqs.append(
            ('merck-wgs-3793', merck_wgs_3793_threshold))
        quality_filter = {
            'vcf_filter': 'pass',
            'min_gq': GQ_threshold,
            'min_ab': AB_threshold,
        }

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]

        # create individuals_variants.tsv; the context manager closes the gzip
        # stream even when one of the sanity-check assertions below fires.
        with gzip.open('individuals_in_%s.tsv.gz' % project_id,
                       'w') as individual_variants_f:
            writer = csv.writer(individual_variants_f,
                                dialect='excel',
                                delimiter='\t')
            writer.writerow(header_fields)

            # collect the resources that we'll need here
            custom_population_store = get_custom_population_store()

            individual_counter = 0
            for family in project.get_families():
                for individual in family.get_individuals():
                    if individual.indiv_id not in ids_of_interest:
                        continue
                    individual_counter += 1
                    print("%s: %s, individual %s" %
                          (individual_counter, family.family_id,
                           individual.indiv_id))
                    for variant in get_variants(
                            get_datastore(project.project_id),
                            family.xfamily(),
                            variant_filter=variant_filter,
                            quality_filter=quality_filter,
                            indivs_to_consider=[individual.indiv_id]):
                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            # no genotype record for this individual at this
                            # variant; warn and skip instead of crashing
                            print("WARNING: %s variant genotype for %s is None" %
                                  (variant, individual.indiv_id))
                            continue
                        # keep only called, sufficiently covered, non-reference genotypes
                        if len(genotype.alleles) == 0 or genotype.extras[
                                "dp"] < DP_threshold or genotype.num_alt == 0:
                            continue

                        custom_populations = custom_population_store.get_frequencies(
                            variant.xpos, variant.ref, variant.alt)

                        genotype_str = "/".join(
                            genotype.alleles) if genotype.alleles else "./."

                        freqs = variant.annotation['freqs']
                        g1k_freq = freqs['1kg_wgs_phase3']
                        g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                        exac_freq = freqs['exac_v3']
                        exac_popmax_freq = freqs['exac_v3_popmax']
                        merck_wgs_3793_freq = custom_populations.get(
                            'merck-wgs-3793', 0.0)

                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                        assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                        assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                            variant.chr, variant.pos, genotype.gq)
                        assert genotype.extras[
                            "dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                        if genotype.num_alt == 1:
                            # AB_threshold is divided by 100 before being compared with genotype.ab
                            assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                variant.chr, variant.pos, genotype.ab)
                        assert genotype.filter == "pass", "%s %s - filter is %s " % (
                            variant.chr, variant.pos, genotype.filter)

                        writer.writerow([
                            str(value) for value in [
                                project_id,
                                family.family_id,
                                individual.indiv_id,
                                get_gene_symbol(variant),
                                variant.chr,
                                variant.pos,
                                variant.ref,
                                variant.alt,
                                variant.vcf_id,
                                variant.annotation['vep_group'],
                                g1k_freq,
                                g1k_popmax_freq,
                                exac_freq,
                                exac_popmax_freq,
                                merck_wgs_3793_freq,
                                genotype_str,
                                genotype.num_alt,
                                genotype.ab,
                                genotype.extras["ad"],
                                genotype.extras["dp"],
                                genotype.gq,
                                genotype.extras["pl"],
                                genotype.filter,
                            ]
                        ])
                        # flush per row so partial output is on disk during a long run
                        individual_variants_f.flush()
Ejemplo n.º 46
0
def handle_project(project_id):
    """Write a gzipped, tab-separated report of family variants for a project.

    For each inheritance mode searched, every variant yielded by
    ``get_variants_for_inheritance_for_project`` becomes one row of
    ``family_variants_<project_id>.tsv.gz``: variant-level columns followed by
    one group of genotype columns per individual, for at most 10 individuals
    of the family.

    The frequency and genotype-quality thresholds are soft checks only: a
    violation prints a traceback and the row is still written.

    Args:
        project_id: id of the project to report on (looked up with
            ``Project.objects.get``).
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]

    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]

    # one group of genotype columns per individual slot (10 slots)
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))

    inheritance_modes = [
        'homozygous_recessive',
        'dominant',
        'compound_het',
        'de_novo',
        'x_linked_recessive',
        'all_variants',
    ]

    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
        writer.writerow(header_fields)

        # collect the resources that we'll need here (same for every inheritance mode)
        custom_population_store = mall.get_custom_population_store()
        project = Project.objects.get(project_id=project_id)

        for inheritance_mode in inheritance_modes:
            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(variant.xpos, variant.ref, variant.alt)
                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get('merck-pcr-free-wgs-144', 0.0)

                    # soft check: report (but do not drop) variants above the frequency thresholds
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                        #assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "Merck WGS 3793 threshold %s > %s" % (merck_wgs_3793_freq, merck_wgs_3793_threshold)
                        #assert merck_wgs_144_freq <= merck_wgs_144_threshold, "Merck PCR free 144 threshold %s > %s" % (merck_wgs_144_freq, merck_wgs_144_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    individuals = family.get_individuals()
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." % str(family))
                        continue

                    # filter value is stored in the genotypes; the first individual may
                    # have no genotype for this variant, so guard against None
                    first_genotype = variant.get_genotype(individuals[0].indiv_id)
                    if first_genotype is not None:
                        filter_value = first_genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras['orig_alt_alleles']

                    clinvar_significance = CLINVAR_VARIANTS.get(variant.unique_tuple(), [""])[-1]
                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_idx, individual in enumerate(individuals):
                        # the header only has room for 10 individuals
                        if indiv_idx >= 10:
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            row.extend([individual.indiv_id, "./.", "", "", "", "", "", ""])
                            continue

                        #assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                        # soft check: report (but keep) genotypes below the quality thresholds
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        # always release the gzip handle, even if a lookup or write fails part-way
        family_variants_f.close()

    print("Done with " + filename)
Ejemplo n.º 47
0
def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one CSV download row (a list of strings) for `variant` in gene_quicklook."""
    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
    freqs = variant.annotation["freqs"]

    # one genotype column per included individual, plus a combined summary column
    genotypes = []
    all_genotypes_parts = []
    for indiv_id in individuals_to_include:
        if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            all_genotypes_parts.append(indiv_id + ":" + allele_string + "  ")
            genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
        else:
            genotypes.append("")

    measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())
    fields = [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or "",
        variant.annotation.get("vep_consequence", ""),
        worst_annotation.get("hgvsc", ""),
        worst_annotation.get("hgvsp", "").replace("%3D", "="),
        worst_annotation.get("sift", ""),
        worst_annotation.get("polyphen", ""),
        worst_annotation.get("mutationtaster_pred", ""),
        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
        measureset_id,
        clinvar_significance,
        freqs.get("1kg_wgs_phase3", ""),
        freqs.get("1kg_wgs_phase3_popmax", ""),
        freqs.get("exac_v3", ""),
        freqs.get("exac_v3_popmax", ""),
        "".join(all_genotypes_parts),
    ] + genotypes
    return [str(field) for field in fields]


def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders 'project/gene_quicklook.html' with the rare coding variants and the
    knockout individuals found for the gene in the searched projects, or, when
    the 'download' GET parameter is 'knockouts' or 'rare_variants', returns the
    corresponding table as a CSV attachment. Any other non-empty 'download'
    value gets a 400 response.

    The 'selected_projects' GET parameter (comma-separated project ids) selects
    the projects to search; every id must be one the user can view, otherwise
    "Unauthorized" is returned. Without it only the main project is searched.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(
        request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': main_project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [
            project for project in other_projects
            if project.project_id in project_ids
        ]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(
            project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids(
                [indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            max_af = max([
                freq for label, freq in variant.annotation['freqs'].items()
                if label != "AF"
            ])  # don't filter on within-cohort AF

            # skip variants that no individual actually carries
            if not any(indiv_id
                       for indiv_id, genotype in variant.genotypes.items()
                       if genotype.num_alt > 0):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        rare_variants.extend(project_variants)

    all_variants = sum([i['variants'] for i in individ_ids_and_variants],
                       rare_variants)
    # NOTE(review): `project` here is the last project of the loop above - confirm this is intended
    add_extra_info_to_variants_project(get_reference(), project, all_variants)

    download_csv = request.GET.get('download', '')
    if download_csv:
        # Reject unknown download types up front: previously they fell through
        # to undefined `rows` / `individuals_to_include` and raised a NameError.
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Unknown download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv,
                gene.get("symbol") or gene.get("transcript_name"))

        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            rows = [
                _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include)
                for entry in individ_ids_and_variants
                for variant in entry["variants"]
            ]
        else:  # 'rare_variants'
            # individuals carrying at least one rare variant, in first-seen order
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            rows = [
                _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include)
                for variant in rare_variants
            ]

        # NOTE(review): assumes every included individual has an entry in
        # indiv_id_to_project_id (filled from rare-variant genotypes) - TODO confirm
        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + [
            indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id]
            for indiv_id in individuals_to_include
        ]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # the knockout variants are serialized in place so the list can be JSON-dumped below
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in variants
            ]

        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())
        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids,
            project__project_id__in=project_ids).select_related(
                'project').select_related('family').only(
                    'project__project_id', 'family__family_id',
                    *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(
            request, 'project/gene_quicklook.html', {
                'gene': gene,
                'gene_json': json.dumps(gene),
                'project': main_project,
                'rare_variants_json': json.dumps(
                    [v.toJSON() for v in rare_variants]),
                'individuals_json': json.dumps([
                    i.get_json_obj(skip_has_variant_data=True)
                    for i in individuals
                ]),
                'knockouts_json': json.dumps(individ_ids_and_variants),
                'other_projects_json': other_projects_json,
            })
Ejemplo n.º 48
0
    def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
        '''
        Search for a gene across project(s) and write the matching variants to a TSV file.

        One row per variant is written to 'results_<search_gene_id>.tsv'.
        Projects whose datastore collection is not loaded are skipped.

        Args:
          search_gene_id: gene string to search for (resolved with get_gene_id_from_str)
          project_id_list: project ids to narrow the search to; if empty or None,
            all projects are searched
          max_af: variants whose highest annotated frequency is >= this value are skipped
        '''
        gene_id = get_gene_id_from_str(search_gene_id, get_reference())
        gene = get_reference().get_gene(gene_id)

        # `or []` keeps this message from crashing when no project list is given
        print("Staring gene search for: %s %s in projects: %s\n" %
              (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
        print("Max AF threshold: %s" % max_af)

        # all rare coding variants
        variant_filter = get_default_variant_filter(
            'all_coding',
            mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        header = [
            "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
            "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
            "fathmm", "clinvar_id", "clinvar_clinical_sig",
            "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
            "freq_exac_v3", "freq_exac_v3_popmax", "all_genotypes"
        ]

        output_filename = 'results_' + search_gene_id + '.tsv'
        outfile = open(output_filename, 'w')
        try:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            if project_id_list:
                # fail fast on an unknown project id: indexing an empty
                # queryset raises IndexError  (TODO validate)
                for project_id in project_id_list:
                    Project.objects.filter(project_id=project_id)[0]
            else:
                project_id_list = [p.project_id for p in Project.objects.all()]

            for project_id in project_id_list:
                project = Project.objects.filter(project_id=project_id)[0]
                if get_project_datastore(project_id).project_collection_is_loaded(
                        project_id):
                    print("Running on project %s" % project_id)
                else:
                    print(
                        "Skipping project %s - gene search is not enabled for this project"
                        % project_id)
                    continue

                for variant in project_analysis.get_variants_in_gene(
                        project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue
                    #pprint(variant.toJSON())
                    add_extra_info_to_variants_project(get_reference(), project,
                                                       [variant])

                    worst_annotation_idx = variant.annotation[
                        "worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][
                        worst_annotation_idx]
                    freqs = variant.annotation["freqs"]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        pass_filter = genotype.filter  # filter value is stored in the genotypes even though it's the same for all individuals
                        if genotype.num_alt > 0:
                            allele_balance = genotype.ab if genotype.ab is not None else float('NaN')
                            all_genotypes_list.append(
                                "%s[gt:%s GQ:%s AB:%0.3f]" %
                                (indiv_id, ">".join(genotype.alleles),
                                 genotype.gq, allele_balance))

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))
                    row = [str(field) for field in [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(
                            set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)
        finally:
            # close the results file even if a lookup or write fails part-way
            outfile.close()

        print("Wrote out %s" % output_filename)
Ejemplo n.º 49
0
def add_populations_to_variants(variants, population_slug_list):
    """Annotate `variants` with frequencies for the given populations (best effort).

    Does nothing when `population_slug_list` is empty or None. Any error raised
    by the population frequency store is reported as a warning on stdout rather
    than propagated.

    Args:
        variants: variants passed through to the population frequency store.
        population_slug_list: slugs of the populations to add.
    """
    if not population_slug_list:
        return

    try:
        mall.get_annotator().get_population_frequency_store().add_populations_to_variants(variants, population_slug_list)
    except Exception as e:  # deliberate best-effort: warn and carry on
        print("WARNING: got unexpected error in add_populations_to_variants: %s" % e)
Ejemplo n.º 50
0
def preload_vep_vcf_annotations(vcf_file_path):
    """Open the VCF at `vcf_file_path` and hand it to the annotator's preload_vep_annotated_vcf.

    Args:
        vcf_file_path: path of the VCF file to open for reading.
    """
    # NOTE(review): assumes preload_vep_annotated_vcf finishes reading the file
    # before it returns, so the handle can be closed here - confirm
    with open(vcf_file_path) as vcf_file:
        mall.get_annotator().preload_vep_annotated_vcf(vcf_file)
Ejemplo n.º 51
0
 def load_population_frequency_store(self):
     """Load each population in annotator_settings.reference_populations_to_load into the annotator's population frequency store."""
     store = mall.get_annotator().get_population_frequency_store()
     specs_to_load = annotator_settings.reference_populations_to_load
     for spec in specs_to_load:
         # announce each population before loading it
         description = str(spec)
         print("Loading " + description)
         store.load_population(spec)
Ejemplo n.º 52
0
 def load_population_frequency_store(self):
     """Feed every configured reference population spec to the population frequency store, printing each one first."""
     annotator = mall.get_annotator()
     frequency_store = annotator.get_population_frequency_store()
     for spec in annotator_settings.reference_populations_to_load:
         progress_line = "Loading " + str(spec)
         print(progress_line)
         frequency_store.load_population(spec)
Ejemplo n.º 53
0
def handle_project(project_id):
    """Write a gzipped, tab-separated report of family variants for a project.

    For each inheritance mode searched, every variant yielded by
    ``get_variants_for_inheritance_for_project`` becomes one row of
    ``family_variants_<project_id>.tsv.gz``: variant-level columns followed by
    one group of genotype columns per individual, for at most 10 individuals
    of the family.

    The frequency and genotype-quality thresholds are soft checks only: a
    violation prints a traceback and the row is still written.

    Args:
        project_id: id of the project to report on (looked up with
            ``Project.objects.get``).
    """
    import traceback

    filename = 'family_variants_%s.tsv.gz' % project_id
    print("Generating report: " + filename)

    header_fields = [
        '#inheritance_mode',
        'project_id',
        'family_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'filter',
        'clinvar_status',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'merck_wgs_144_af',
        'multiallelic_site_alt_alleles (* = spanning deletion)',
        '',
    ]

    genotype_headers = [
        'sample_id',
        'str',
        'num_alt',
        'allele_balance',
        'AD',
        'DP',
        'GQ',
        'PL',
    ]

    # one group of genotype columns per individual slot (10 slots)
    for i in range(0, 10):
        for h in genotype_headers:
            header_fields.append("genotype%d_%s" % (i, h))

    inheritance_modes = [
        'homozygous_recessive', 'dominant', 'compound_het', 'de_novo',
        'x_linked_recessive', 'all_variants'
    ]

    # create family_variants.tsv
    family_variants_f = gzip.open(filename, 'w')
    try:
        writer = csv.writer(family_variants_f, dialect='excel', delimiter='\t')
        writer.writerow(header_fields)

        # collect the resources that we'll need here (same for every inheritance mode)
        custom_population_store = mall.get_custom_population_store()
        project = Project.objects.get(project_id=project_id)

        for inheritance_mode in inheritance_modes:
            # get the variants for this inheritance / project combination
            for family, family_results in get_variants_for_inheritance_for_project(
                    project, inheritance_mode):
                for variant in family_results:
                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)
                    freqs = variant.annotation['freqs']
                    g1k_freq = freqs['1kg_wgs_phase3']
                    g1k_popmax_freq = freqs['1kg_wgs_phase3_popmax']
                    exac_freq = freqs['exac_v3']
                    exac_popmax_freq = freqs['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get(
                        'merck-wgs-3793', 0.0)
                    merck_wgs_144_freq = custom_populations.get(
                        'merck-pcr-free-wgs-144', 0.0)

                    # soft check: report (but do not drop) variants above the frequency thresholds
                    try:
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                            g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                            g1k_popmax_freq, g1k_popmax_freq_threshold)
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                            exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq, exac_popmax_threshold)
                        #assert merck_wgs_3793_freq <= merck_wgs_3793_threshold, "Merck WGS 3793 threshold %s > %s" % (merck_wgs_3793_freq, merck_wgs_3793_threshold)
                        #assert merck_wgs_144_freq <= merck_wgs_144_threshold, "Merck PCR free 144 threshold %s > %s" % (merck_wgs_144_freq, merck_wgs_144_threshold)
                    except AssertionError:
                        traceback.print_exc()

                    individuals = family.get_individuals()
                    if len(individuals) == 0:
                        print("Family has 0 individuals: %s - skipping..." %
                              str(family))
                        continue

                    # filter value is stored in the genotypes; fall back to
                    # 'unknown' when the first individual has no genotype
                    first_genotype = variant.get_genotype(individuals[0].indiv_id)
                    if first_genotype is not None:
                        filter_value = first_genotype.filter
                    else:
                        filter_value = 'unknown'

                    multiallelic_site_other_alleles = []
                    if len(variant.extras['orig_alt_alleles']) > 1:
                        multiallelic_site_other_alleles = variant.extras[
                            'orig_alt_alleles']

                    clinvar_significance = get_clinvar_variants().get(
                        variant.unique_tuple(), [""])[-1]
                    row = [
                        inheritance_mode,
                        project_id,
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        str(variant.pos),
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        filter_value,
                        clinvar_significance,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        merck_wgs_144_freq,
                        ", ".join(multiallelic_site_other_alleles),
                        '',
                    ]

                    for indiv_idx, individual in enumerate(individuals):
                        # the header only has room for 10 individuals
                        if indiv_idx >= 10:
                            break

                        genotype = variant.get_genotype(individual.indiv_id)
                        if genotype is None:
                            row.extend([
                                individual.indiv_id, "./.", "", "", "", "", "", ""
                            ])
                            continue

                        #assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)
                        # soft check: report (but keep) genotypes below the quality thresholds
                        try:
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr, variant.pos, genotype.gq)
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr, variant.pos, genotype.extras["dp"])
                            if genotype.num_alt == 1:
                                assert genotype.ab is None or genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                                    variant.chr, variant.pos, genotype.ab)
                        except AssertionError:
                            traceback.print_exc()

                        genotype_str = "/".join(
                            genotype.alleles) if genotype.alleles else "./."

                        row.extend([
                            individual.indiv_id,
                            genotype_str,
                            genotype.num_alt,
                            genotype.ab if genotype.ab is not None else '',
                            genotype.extras["ad"],
                            genotype.extras["dp"],
                            genotype.gq,
                            genotype.extras["pl"],
                        ])

                    writer.writerow(row)
                    family_variants_f.flush()
    finally:
        # always release the gzip handle, even if a lookup or write fails part-way
        family_variants_f.close()

    print("Done with " + filename)
Ejemplo n.º 54
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders the gene quicklook page or, when the ``download`` GET parameter
    is set, returns the same data as a CSV attachment.

    Args:
        request: incoming HTTP request. The GET parameters read here are
            ``selected_projects`` (comma-separated project ids to search
            instead of the main project) and ``download`` (``'knockouts'``
            or ``'rare_variants'``).
        project_id: id of the main project; a 404 is raised if it does not exist.
        gene_id: gene identifier string resolved with get_gene_id_from_str,
            or None to render the page without any gene data.

    Returns:
        An HttpResponse: the rendered 'project/gene_quicklook.html' page, a
        text/csv attachment, an "Unauthorized" response when the user may not
        view the main project or one of the selected projects, or a 400
        response when ``download`` has an unsupported value.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    new_page_url = '/variant_search/project/{}'.format(main_project.seqr_project.guid) if main_project.seqr_project and main_project.seqr_project.has_new_search else None

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{'project_id': p.project_id, 'project_name': p.project_name} for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [project for project in other_projects if project.project_id in project_ids]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            # the knockouts CSV header looks every knockout individual up in this
            # mapping, so record them here even if none of their variants pass
            # the rare-variant checks below
            indiv_id_to_project_id[indiv_id] = project.project_id
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variation.get_relevant_variants_for_indiv_ids([indiv_id]),
            })

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            # don't filter on within-cohort AF
            reference_freqs = [freq for label, freq in variant.annotation['freqs'].items() if label != "AF"]
            # a variant without any reference frequency is treated as frequency 0
            max_af = max(reference_freqs) if reference_freqs else 0

            # skip variants that nobody actually carries
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                known_genotypes = rare_variant_dict[variant_id].genotypes
                for indiv_id, genotype in variant.genotypes.items():
                    existing_genotype = known_genotypes.get(indiv_id)
                    if not existing_genotype or existing_genotype.num_alt == -1:
                        known_genotypes[indiv_id] = genotype
        if project != main_project:
            add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    # rare variants first, followed by each knockout individual's variants
    all_variants = list(rare_variants)
    for individ_id_and_variants in individ_ids_and_variants:
        all_variants.extend(individ_id_and_variants['variants'])
    add_extra_info_to_variants_project(get_reference(), main_project, all_variants, add_family_tags=True)

    download_csv = request.GET.get('download', '')
    if download_csv:
        # only these two exports exist; reject anything else explicitly
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Invalid download type", status=400)

        def get_row(variant, individuals_to_include):
            """Return the CSV cells (as strings) for one variant, ending with one genotype cell per listed individual."""
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

            if 'clinvar_allele_id' in variant.extras:
                measureset_id = variant.extras['clinvar_allele_id']
                clinvar_significance = variant.extras['clinvar_clinsig']
            else:
                measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())

            genotypes = []
            carrier_genotypes = []
            for indiv_id in individuals_to_include:
                if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    carrier_genotypes.append(indiv_id + ":" + allele_string + "  ")
                    genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            all_genotypes_string = "".join(carrier_genotypes)

            freqs = variant.annotation["freqs"]
            cells = [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or variant.annotation.get("rsid") or "",
                variant.annotation.get("vep_consequence") or "",
                worst_annotation.get("hgvsc") or "",
                (worst_annotation.get("hgvsp") or "").replace("%3D", "="),
                variant.annotation.get("sift") or "",
                variant.annotation.get("polyphen") or "",
                variant.annotation.get("mutationtaster_pred") or variant.annotation.get("muttaster") or "",
                (";".join(set((worst_annotation.get("fathmm_pred") or "").split('%3B')))) or variant.annotation.get("fathmm") or "",

                measureset_id or "",
                clinvar_significance or "",

                freqs.get("1kg_wgs_phase3") or freqs.get("1kg_wgs_AF") or "",
                freqs.get("1kg_wgs_phase3_popmax") or freqs.get("1kg_wgs_popmax_AF") or "",
                freqs.get("exac_v3") or freqs.get("exac_v3_AF") or "",
                freqs.get("exac_v3_popmax") or freqs.get("exac_v3_popmax_AF") or "",
                freqs.get("gnomad_exomes_AF") or "",
                freqs.get("gnomad_exomes_popmax_AF") or "",
                freqs.get("gnomad_genomes_AF") or "",
                freqs.get("gnomad_genomes_popmax_AF") or "",
                all_genotypes_string,
            ] + genotypes
            return [str(cell) for cell in cells]

        rows = []
        if download_csv == 'knockouts':
            individuals_to_include = [individ_id_and_variants["indiv_id"] for individ_id_and_variants in individ_ids_and_variants]
            for individ_id_and_variants in individ_ids_and_variants:
                for variant in individ_id_and_variants["variants"]:
                    rows.append(get_row(variant, individuals_to_include))
        else:  # 'rare_variants'
            # one genotype column per individual carrying at least one rare variant,
            # in order of first appearance
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            for variant in rare_variants:
                rows.append(get_row(variant, individuals_to_include))

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "freq_gnomad_exomes", "freq_gnomad_exomes_popmax",
                  "freq_gnomad_genomes", "freq_gnomad_genomes_popmax",
                  "all_genotypes"] + [indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id] for indiv_id in individuals_to_include]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene.get("symbol") or gene.get("transcript_name"))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # variant objects are replaced by their JSON form so the list can be dumped below
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())
        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids, project__project_id__in=project_ids
        ).select_related('project').select_related('family').only('project__project_id', 'family__family_id', *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': main_project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([i.get_json_obj(skip_has_variant_data=True) for i in individuals]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })
Ejemplo n.º 55
0
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders the gene quicklook page or, when the ``download`` GET parameter
    is set, returns the same data as a CSV attachment.

    Args:
        request: incoming HTTP request; the ``download`` GET parameter
            (``'knockouts'`` or ``'rare_variants'``) selects a CSV export.
        project_id: id of the project; a 404 is raised if it does not exist.
        gene_id: gene identifier string resolved with get_gene_id_from_str,
            or None to render the page without any gene data.

    Returns:
        An HttpResponse: the rendered 'project/gene_quicklook.html' page, a
        text/csv attachment, an "Unauthorized" response when the user may not
        view the project, or a 400 response when ``download`` has an
        unsupported value.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
        freqs = variant.annotation['freqs']
        # a variant without any annotated frequency is treated as frequency 0
        max_af = max(freqs.values()) if freqs else 0
        if max_af < .01:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" % (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if download_csv:
        # only these two exports exist; reject anything else explicitly
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Invalid download type", status=400)

        def get_row(variant, individuals_to_include):
            """Return the CSV cells (as strings) for one variant, ending with one genotype cell per listed individual."""
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

            genotypes = []
            called_genotypes = []
            for indiv_id in individuals_to_include:
                genotype = variant.genotypes.get(indiv_id)
                if genotype is None:
                    # this variant has no call for the individual: keep the column, leave it blank
                    genotypes.append("")
                    continue
                allele_string = ">".join(genotype.alleles)
                called_genotypes.append(indiv_id + ":" + allele_string + "  ")
                if genotype.num_alt > 0:
                    genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            all_genotypes_string = "".join(called_genotypes)

            measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))

            freqs = variant.annotation["freqs"]
            cells = [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or "",
                variant.annotation.get("vep_consequence", ""),
                worst_annotation.get("hgvsc", ""),
                # `or ""` guards against a key that is present but set to None
                (worst_annotation.get("hgvsp", "") or "").replace("%3D", "="),
                worst_annotation.get("sift", ""),
                worst_annotation.get("polyphen", ""),
                worst_annotation.get("mutationtaster_pred", ""),
                ";".join(set((worst_annotation.get("fathmm_pred", "") or "").split('%3B'))),

                measureset_id,
                clinvar_significance,

                freqs.get("1kg_wgs_phase3", ""),
                freqs.get("1kg_wgs_phase3_popmax", ""),
                freqs.get("exac_v3", ""),
                freqs.get("exac_v3_popmax", ""),
                all_genotypes_string,
            ] + genotypes
            return [str(cell) for cell in cells]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene["transcript_name"])

        rows = []
        if download_csv == 'knockouts':
            individuals_to_include = [individ_id_and_variants["indiv_id"] for individ_id_and_variants in individ_ids_and_variants]
            for individ_id_and_variants in individ_ids_and_variants:
                for variant in individ_id_and_variants["variants"]:
                    rows.append(get_row(variant, individuals_to_include))
        else:  # 'rare_variants'
            # one genotype column per individual carrying at least one rare variant,
            # in order of first appearance
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            for variant in rare_variants:
                rows.append(get_row(variant, individuals_to_include))

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "all_genotypes"] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # variant objects are replaced by their JSON form so the list can be dumped below
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
        })
Ejemplo n.º 56
0
    def search_for_genes(self,
                         gene_or_variant_ids,
                         project_id_list,
                         output_filename,
                         max_af=0.01,
                         knockouts=False,
                         in_clinvar_only=False,
                         include_non_coding=False):
        """
        Search for genes or variants across project(s) and write the matches
        to a tab-delimited file.

        Args:
            gene_or_variant_ids (list): gene identifiers (resolved with
                get_gene_id_from_str) and/or variant ids of the form
                'chrom-pos' or 'chrom-pos-ref-alt'.
            project_id_list (list): ids of the projects to search.
            output_filename (string): output file name
            max_af (float): variants found through a gene search are skipped
                when their highest annotated frequency is >= this value; it
                is not applied to variants looked up by variant id.
            knockouts (bool): for gene searches, report the variants of
                knockout individuals instead of the variants passing the
                default 'all_coding' filter.
            in_clinvar_only (bool): only keep variants whose clinvar
                significance contains "path".
            include_non_coding (bool): clear the default filter's
                so_annotations (has no effect when knockouts is True).
        """

        projects = [
            Project.objects.get(project_id=project_id)
            for project_id in project_id_list
        ]

        header = [
            "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
            "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
            "fathmm", "clinvar_id", "clinvar_clinical_sig",
            "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
            "freq_exac_v3", "freq_exac_v3_popmax", "gnomad-exomes",
            "gnomad-genomes", "families", "all_genotypes"
        ]

        # compiled once instead of once per project / id
        chrom_pos_re = re.compile("([0-9XY]{1,2})-([0-9]{1,9})")
        chrom_pos_ref_alt_re = re.compile(
            "([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)")

        def get_individual(project, indiv_id, indiv_cache):
            """Return the project's Individual for indiv_id (memoized in indiv_cache), or None if it no longer exists."""
            if indiv_id not in indiv_cache:
                try:
                    indiv_cache[indiv_id] = Individual.objects.get(
                        project=project, indiv_id=indiv_id)
                except ObjectDoesNotExist:
                    # this can happen when an individual is deleted from the project - from postgres, but not from mongo
                    indiv_cache[indiv_id] = None
                except MultipleObjectsReturned:
                    # when several families have an individual with the same id
                    indiv_cache[indiv_id] = Individual.objects.filter(
                        project=project, indiv_id=indiv_id)[0]
            return indiv_cache[indiv_id]

        # the context manager closes the file even if the search raises part-way
        with open(output_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow(header)

            # all rare coding variants
            if not knockouts:
                # max_af is applied per variant below, not through this filter
                variant_filter = get_default_variant_filter(
                    'all_coding',
                    mall.get_annotator().reference_population_slugs)
                if include_non_coding:
                    variant_filter.so_annotations = []
                print("All Filters: ")
                pprint(variant_filter.toJSON())

            print("Starting search for:\n%s\nin projects:\n%s\n" %
                  (", ".join(gene_or_variant_ids), ", ".join(
                      [p.project_id for p in projects])))

            for project in projects:
                project_id = project.project_id
                if not get_project_datastore(
                        project).project_collection_is_loaded(project):
                    print(
                        "Skipping project %s - gene search is not enabled for this project"
                        % project_id)
                    continue
                print("=====================")
                print("Searching project %s" % project_id)

                indiv_cache = {}
                for gene_or_variant_id in gene_or_variant_ids:
                    # anything matching chrom-pos-ref-alt also matches chrom-pos,
                    # so the shorter pattern alone decides whether this is a variant id
                    chrom_pos_match = chrom_pos_re.match(gene_or_variant_id)
                    is_variant_id = chrom_pos_match is not None

                    if is_variant_id:
                        chrom = chrom_pos_match.group(1)
                        pos = int(chrom_pos_match.group(2))
                        xpos = genomeloc.get_xpos(chrom, pos)
                        ref = alt = None
                        chrom_pos_ref_alt_match = chrom_pos_ref_alt_re.match(
                            gene_or_variant_id)
                        if chrom_pos_ref_alt_match:
                            ref = chrom_pos_ref_alt_match.group(3)
                            alt = chrom_pos_ref_alt_match.group(4)

                        variant = get_project_datastore(
                            project).get_single_variant(
                                project.project_id, None, xpos, ref, alt)
                        if variant is None:
                            continue
                        variants = [variant]
                        print("-- searching %s for variant %s-%s-%s: found %s" %
                              (project_id, xpos, ref, alt, variant))
                        worst_annotation_idx = variant.annotation[
                            'worst_vep_annotation_index']
                        print(variant.annotation["vep_annotation"]
                              [worst_annotation_idx])
                        gene_id = variant.annotation["vep_annotation"][
                            worst_annotation_idx]['gene_id']
                        gene = get_reference().get_gene(gene_id)
                    else:
                        gene_id = get_gene_id_from_str(gene_or_variant_id,
                                                       get_reference())
                        gene = get_reference().get_gene(gene_id)
                        print("-- searching %s for gene %s (%s)" %
                              (project_id, gene["symbol"], gene_id))

                        if knockouts:
                            knockout_ids, variation = project_analysis.get_knockouts_in_gene(
                                project, gene_id)
                            variants = variation.get_relevant_variants_for_indiv_ids(
                                knockout_ids)
                        else:
                            variants = project_analysis.get_variants_in_gene(
                                project, gene_id,
                                variant_filter=variant_filter)

                    for variant in variants:
                        freqs = variant.annotation['freqs']
                        # AF cut-off only for gene searches; a variant without
                        # any annotated frequency is never filtered out here
                        if not is_variant_id and freqs and max(
                                freqs.values()) >= max_af:
                            continue

                        add_extra_info_to_variants_project(
                            get_reference(), project, [variant])
                        worst_annotation_idx = variant.annotation[
                            "worst_vep_index_per_gene"].get(gene_id)

                        if worst_annotation_idx is not None:
                            worst_annotation = variant.annotation[
                                "vep_annotation"][worst_annotation_idx]
                        else:
                            worst_annotation = None

                        all_genotypes_list = []
                        pass_filter = "N/A"
                        family_ids = set()
                        for indiv_id, genotype in variant.genotypes.items():
                            individual = get_individual(
                                project, indiv_id, indiv_cache)
                            if individual is None:
                                continue

                            pass_filter = genotype.filter  # filter value is stored in the genotypes even though it's the same for all individuals
                            if genotype.num_alt > 0:
                                family_id = individual.family.family_id
                                family_ids.add(family_id)
                                if individual.affected == "A":
                                    affected_label = "[Affected]"
                                elif individual.affected == "N":
                                    affected_label = "[-]"
                                else:
                                    affected_label = "[?]"
                                all_genotypes_list.append(
                                    "%s/%s%s[gt:%s GQ:%s AB:%0.3f]" %
                                    (family_id, indiv_id, affected_label,
                                     ">".join(genotype.alleles), genotype.gq,
                                     genotype.ab if genotype.ab is not None
                                     else float('NaN')))

                        # nobody (still in the project) carries this variant
                        if not all_genotypes_list:
                            continue

                        measureset_id, clinvar_significance = get_reference(
                        ).get_clinvar_info(*variant.unique_tuple())
                        if in_clinvar_only and (
                                not clinvar_significance
                                or "path" not in clinvar_significance.lower()):
                            continue

                        if worst_annotation:
                            hgvsc = worst_annotation.get("hgvsc", "")
                            # `or ""` guards against a key that is present but set to None
                            hgvsp = (worst_annotation.get("hgvsp", "")
                                     or "").replace("%3D", "=")
                            sift = worst_annotation.get("sift", "")
                            polyphen = worst_annotation.get("polyphen", "")
                            muttaster = worst_annotation.get(
                                "mutationtaster_pred", "")
                            fathmm = ";".join(
                                set((worst_annotation.get("fathmm_pred", "")
                                     or "").split('%3B')))
                        else:
                            hgvsc = hgvsp = sift = polyphen = muttaster = fathmm = ""

                        row = [
                            project_id,
                            # NOTE(review): this writes the whole gene record
                            # returned by get_gene(), not gene["symbol"] -
                            # confirm which one the "gene" column should hold
                            gene,
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            pass_filter,
                            variant.annotation.get("vep_consequence", ""),
                            hgvsc,
                            hgvsp,
                            sift,
                            polyphen,
                            muttaster,
                            fathmm,
                            measureset_id,
                            clinvar_significance,
                            freqs.get("1kg_wgs_phase3", ""),
                            freqs.get("1kg_wgs_phase3_popmax", ""),
                            freqs.get("exac_v3", ""),
                            freqs.get("exac_v3_popmax", ""),
                            freqs.get("gnomad-exomes2", ""),
                            freqs.get("gnomad-genomes2", ""),
                            ", ".join(sorted(family_ids)),
                            ", ".join(all_genotypes_list),
                        ]

                        writer.writerow([str(cell) for cell in row])

        print("Wrote out %s" % output_filename)
Ejemplo n.º 57
0
    def handle(self, *args, **options):
        """Export a project's inheritance-filtered variants to a gzipped TSV.

        Expects exactly one positional argument, the project id, and exits
        via ``sys.exit`` with an error message otherwise. Writes
        ``family_variants_<project_id>.tsv.gz`` in the current working
        directory: one header row, then one row per (inheritance mode,
        family, variant) with the variant columns followed by eight genotype
        columns for each of up to ``max_individuals`` family members.

        Raises:
            AssertionError: if a variant's population frequencies or a
                genotype's filter / GQ / DP / allele balance violate the
                module-level thresholds (defined outside this block).
        """
        if not args:
            sys.exit("ERROR: please specify project id on the command line")
        if len(args) > 1:
            sys.exit("ERROR: too many args: %s. Only one project id should be provided." % " ".join(args))

        project_id = args[0]

        # Number of per-individual genotype column groups; shared by the
        # header and the row loop so the two cannot drift apart.
        max_individuals = 10

        # create family_variants.tsv
        # NOTE(review): mode "w" is a binary handle on Python 3, where
        # csv.writer needs text mode ("wt") - confirm the target interpreter.
        family_variants_f = gzip.open("family_variants_%s.tsv.gz" % project_id, "w")
        try:
            writer = csv.writer(family_variants_f, dialect="excel", delimiter="\t")

            header_fields = [
                "#inheritance_mode",
                "project_id",
                "family_id",
                "gene",
                "chrom",
                "pos",
                "ref",
                "alt",
                "rsid",
                "annotation",
                "1kg_af",
                "1kg_popmax_af",
                "exac_af",
                "exac_popmax_af",
                "",
            ]

            genotype_headers = ["sample_id", "str", "num_alt", "allele_balance", "AD", "DP", "GQ", "PL"]

            for column_group in range(max_individuals):
                for h in genotype_headers:
                    header_fields.append("genotype%d_%s" % (column_group, h))

            writer.writerow(header_fields)
            family_variants_f.flush()

            for inheritance_mode in ["dominant", "homozygous_recessive", "compound_het", "de_novo", "x_linked_recessive"]:
                # collect the resources that we'll need here
                # NOTE(review): `annotator`, `families` and `custom_populations`
                # are never read below; the lookups are kept only in case they
                # have side effects - confirm and remove.
                annotator = mall.get_annotator()
                custom_population_store = mall.get_custom_population_store()

                project = Project.objects.get(project_id=project_id)
                families = project.get_families()

                # get the variants for this inheritance / project combination
                for family, variant_list in get_variants_for_inheritance_for_project(project, inheritance_mode):
                    for variant in variant_list:
                        custom_populations = custom_population_store.get_frequencies(
                            variant.xpos, variant.ref, variant.alt
                        )
                        g1k_freq = variant.annotation["freqs"]["1kg_wgs_phase3"]
                        g1k_popmax_freq = variant.annotation["freqs"]["1kg_wgs_phase3_popmax"]
                        exac_freq = variant.annotation["freqs"]["exac_v3"]
                        exac_popmax_freq = variant.annotation["freqs"]["exac_v3_popmax"]

                        # Sanity checks: every variant reaching this point is
                        # expected to be within the frequency thresholds.
                        assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                        assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k freq %s > %s" % (
                            g1k_popmax_freq,
                            g1k_popmax_freq_threshold,
                        )
                        assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                        assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                            exac_popmax_freq,
                            exac_popmax_threshold,
                        )

                        row = [
                            inheritance_mode,
                            project_id,
                            family.family_id,
                            get_gene_symbol(variant),
                            variant.chr,
                            str(variant.pos),
                            variant.ref,
                            variant.alt,
                            variant.vcf_id,
                            variant.annotation["vep_group"],
                            g1k_freq,
                            g1k_popmax_freq,
                            exac_freq,
                            exac_popmax_freq,
                            "",
                        ]

                        for indiv_index, individual in enumerate(family.get_individuals()):
                            # The header only has room for `max_individuals` groups.
                            if indiv_index >= max_individuals:
                                break

                            genotype = variant.get_genotype(individual.indiv_id)
                            if genotype is None:
                                # NOTE(review): skipping appends no cells, so later
                                # individuals shift left relative to the genotypeN_*
                                # header columns - confirm this is acceptable.
                                print("WARNING: %s variant genotype for %s is None" % (variant, individual.indiv_id))
                                continue

                            assert genotype.filter == "pass", "%s %s - filter is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.filter,
                            )
                            assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.gq,
                            )
                            assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                                variant.chr,
                                variant.pos,
                                genotype.extras["dp"],
                            )
                            # Allele balance is only checked for single-alt genotypes;
                            # AB_threshold is divided by 100 before the comparison.
                            if genotype.num_alt == 1:
                                assert genotype.ab >= AB_threshold / 100.0, "%s %s - AB is %s " % (
                                    variant.chr,
                                    variant.pos,
                                    genotype.ab,
                                )

                            genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                            row.extend(
                                [
                                    individual.indiv_id,
                                    genotype_str,
                                    genotype.num_alt,
                                    genotype.ab,
                                    genotype.extras["ad"],
                                    genotype.extras["dp"],
                                    genotype.gq,
                                    genotype.extras["pl"],
                                ]
                            )

                        writer.writerow(row)
                        family_variants_f.flush()
        finally:
            # Close the handle even when an assertion or lookup above raises,
            # so the gzip trailer is written and the descriptor is released.
            family_variants_f.close()
Ejemplo n.º 58
0
def load_project_variants(project_id,
                          force_annotations=False,
                          ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    First feeds every VCF file of the project to the annotator, then loads
    the not-yet-loaded families (in batches of
    ``settings.FAMILY_LOAD_BATCH_SIZE``, grouped by VCF file) and finally the
    not-yet-loaded cohorts. Progress is printed to stdout.

    Args:
        project_id: id of the project to load; looked up via ``Project``.
        force_annotations: passed to the annotator as ``force`` /
            ``force_all`` when adding each VCF file.
        ignore_csq_in_vcf: when true, a VCF is always sent through
            ``add_vcf_file_to_annotator`` even if it has a "CSQ" info field.
    """

    def log_progress(message):
        # Only the timestamp goes through strftime; the message is appended
        # afterwards so a "%" inside project_id is never read as a directive.
        print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + "  -- " + message)

    batch_size = settings.FAMILY_LOAD_BATCH_SIZE

    print("Loading project %s" % project_id)
    log_progress("loading project: " + project_id + " - db.variants cache")
    # Prints disk usage of the (hard-coded) mongo data directory.
    os.system("du /mongo/mongodb")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" in r.infos:
            # The VCF already has a CSQ info field: import it as pre-annotated.
            mall.get_annotator().add_preannotated_vcf_file(
                vcf_obj.path(), force=force_annotations)
        else:
            mall.get_annotator().add_vcf_file_to_annotator(
                vcf_obj.path(), force_all=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        # Skip families the variant store already reports as loaded.
        families = [
            f for f in families
            if get_mall(project.project_id).variant_store.get_family_status(
                project_id, f.family_id) != 'loaded'
        ]
        for i in range(0, len(families), batch_size):
            batch = families[i:i + batch_size]
            log_progress(
                "loading project: " + project_id +
                " - families batch %d - %d families" % (i, len(batch)))
            load_variants_for_family_list(project, batch, vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    log_progress("loading project: " + project_id + " - cohorts")
    os.system("du /mongo/mongodb")
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        # Cohort status is tracked through the same get_family_status call,
        # keyed by cohort_id.
        cohorts = [
            c for c in cohorts
            if get_mall(project.project_id).variant_store.get_family_status(
                project_id, c.cohort_id) != 'loaded'
        ]
        for i in range(0, len(cohorts), batch_size):
            batch = cohorts[i:i + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    log_progress("finished loading project: " + project_id)