Exemple #1
0
    def handle(self, *args, **options):
        """Load CADD scores for variants that do not have one yet.

        The mode is selected from ``options``:

        * ``cadd_file`` is set: the file is handed to ``load_from_cadd_file``
          and the command finishes.
        * ``project_id`` is set: variants in that project's collection that
          lack ``annotation.cadd_phred`` are looked up one by one.
        * otherwise: all variants in the annotator store that lack
          ``annotation.cadd_phred`` are looked up one by one.

        In the last two modes each score found by ``fetch`` is written to the
        annotator store's ``variants`` collection.
        """

        annotator_store = mall.get_annotator().get_annotator_datastore()
        if options['cadd_file']:
            print("Loading " + options['cadd_file'])
            load_from_cadd_file(options['cadd_file'])
            # Bulk-load mode has no variant cursor to iterate. Falling through
            # to the loop below used to fail with a NameError because
            # variant_collection is only assigned in the other two branches.
            print("Done")
            return
        elif options['project_id']:
            print("Loading " + options['project_id'])
            project = Project.objects.get(project_id=options['project_id'])
            variant_collection = get_project_datastore(project)._get_project_collection(options['project_id']).find({'annotation.cadd_phred': {'$exists' : False}})
        else:
            variant_collection = annotator_store.variants.find({'annotation.cadd_phred': {'$exists' : False}})

        for r in tqdm.tqdm(variant_collection, unit=' variants'):
            chrom, pos = genomeloc.get_chr_pos(r['xpos'])
            cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
            if cadd_phred is not None:
                # upsert=False: only existing annotator-store documents are
                # updated; a missing document trips the assertion below.
                result = annotator_store.variants.update({'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']}, {'$set': {'annotation.cadd_phred': cadd_phred}}, upsert=False)
                assert result['updatedExisting']

        print("Done")
Exemple #2
0
def get_x_linked_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield the variants in a family that follow X-linked inheritance.

    Variants are fetched with get_variants using the family's X-linked
    genotype filter (plus the optional variant/quality filters) and only
    those whose chromosome is 'chrX' are yielded.
    The reference argument is accepted but not used here.
    """
    genotype_filter = inheritance.get_x_linked_filter(family)
    candidates = get_variants(
        datastore,
        family,
        genotype_filter=genotype_filter,
        variant_filter=variant_filter,
        quality_filter=quality_filter,
        indivs_to_consider=family.indiv_id_list(),
    )
    for candidate in candidates:
        chrom = genomeloc.get_chr_pos(candidate.xpos)[0]
        if chrom != 'chrX':
            # the genotype pattern matched, but not on the X chromosome
            continue
        yield candidate
Exemple #3
0
def get_x_linked_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Generator over a family's X-linked variants.

    Queries get_variants with the genotype filter produced by
    inheritance.get_x_linked_filter(family), restricted to the family's own
    individuals, and keeps a variant only when its xpos maps to 'chrX'.
    (reference is part of the signature but is not read.)
    """
    x_filter = inheritance.get_x_linked_filter(family)
    query_kwargs = dict(
        genotype_filter=x_filter,
        variant_filter=variant_filter,
        quality_filter=quality_filter,
        indivs_to_consider=family.indiv_id_list(),
    )
    for hit in get_variants(datastore, family, **query_kwargs):
        chrom, _pos = genomeloc.get_chr_pos(hit.xpos)[:2]
        if chrom == 'chrX':
            yield hit
    def handle(self, *args, **options):
        """Check that each project's VCF variants exist in the annotator cache.

        Positional ``args`` are project ids. When none are given, every
        project from ``Project.objects.all()`` is checked, in reverse order.

        For each project, at most ``number_of_variants_to_check`` variants
        (option, default 20000) are read from the project's VCF files and
        looked up with ``annotator.get_annotation``. A ``ValueError`` from the
        lookup counts as "not found". When more than 10 variants are missing,
        a single error line is printed for the project, followed by up to 30
        example variant URLs.
        """
        number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Exception:
                # Best effort: skip projects that cannot be looked up, but do
                # not swallow KeyboardInterrupt/SystemExit like a bare except.
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            for vcf_file in project.get_all_vcf_files():
                if all_counter >= number_of_variants_to_check:
                    # The quota was reached in an earlier file.
                    break

                path = vcf_file.file_path
                # Fall back to the compressed file when the plain .vcf is gone.
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    path = path + ".gz"

                f = gzip.open(path) if path.endswith(".gz") else open(path)
                try:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project_id).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            # Keep only a handful of examples for the report.
                            if len(not_found_variants) < 30:
                                chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                                chrom = chrom.replace("chr", "")
                                not_found_variants.append(
                                    "%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                        if all_counter >= number_of_variants_to_check:
                            break
                finally:
                    # Always release the file handle, even if parsing fails.
                    f.close()

            # Report once per project, after scanning. This also covers
            # projects that contain fewer variants than the quota.
            if not_found_counter > 10:
                percent_missing = 100.0 * not_found_counter / all_counter
                print("---- ERROR: (%0.2f%%)  %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                    percent_missing, not_found_counter, all_counter, project_id))

                for v in not_found_variants:
                    print("http://exac.broadinstitute.org/variant/" + v)
Exemple #5
0
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Compute combine-mendelian-families search results for a family group.

    Should be called after the cache is checked - this does all the computation.
    The search parameters (inheritance mode, variant filter, quality filter)
    are read from search_spec.

    Returns a list of dicts, one per gene known to the reference, with keys
    gene_info, gene_id, gene_name, chr, start, end and family_id_list.
    chr/start/end are None when looking up the gene bounds raises KeyError.
    """
    xfamilygroup = family_group.xfamilygroup()

    results = []
    families_by_gene = get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    )
    for gene_id, family_id_list in families_by_gene:
        gene_info = get_reference().get_gene(gene_id)
        if gene_info is None:
            # the reference has no record for this gene id
            continue

        try:
            bounds_start, bounds_end = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(bounds_start)
            end = genomeloc.get_chr_pos(bounds_end)[1]
        except KeyError:
            chrom = start = end = None

        results.append({
            'gene_info': gene_info,
            'gene_id': gene_id,
            'gene_name': gene_info['symbol'],
            'chr': chrom,
            'start': start,
            'end': end,
            'family_id_list': family_id_list,
        })

    return results
Exemple #6
0
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Run the combine-mendelian-families search described by search_spec.

    Intended to run only after a cache miss, since all the work happens here.

    Returns:
        list of gene dicts (gene_info, gene_id, gene_name, chr, start, end,
        family_id_list). Genes for which the reference returns None are
        left out; genes whose bounds lookup raises KeyError get None
        coordinates.
    """
    xfamilygroup = family_group.xfamilygroup()

    genes = []
    for gene_id, family_id_list in get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    ):
        xgene = get_reference().get_gene(gene_id)
        if xgene is not None:
            try:
                gene_start_xpos, gene_end_xpos = get_reference().get_gene_bounds(gene_id)
                chrom, start = genomeloc.get_chr_pos(gene_start_xpos)
                end = genomeloc.get_chr_pos(gene_end_xpos)[1]
            except KeyError:
                # bounds unknown - still report the gene, without coordinates
                chrom, start, end = (None,) * 3

            genes.append(dict(
                gene_info=xgene,
                gene_id=gene_id,
                gene_name=xgene['symbol'],
                chr=chrom,
                start=start,
                end=end,
                family_id_list=family_id_list,
            ))

    return genes
Exemple #7
0
def get_recessive_individuals(gene_variation, indiv_id_list):
    """
    Return the set of individual ids with *any* recessive inheritance in the gene:
    homozygous recessive, compound het, or - when the gene starts on chrX -
    x-linked recessive.
    """
    hom_recessive = get_homozygous_recessive_individuals(gene_variation, indiv_id_list)
    compound_het = get_compound_het_individuals(gene_variation, indiv_id_list)
    groups = [hom_recessive, compound_het]

    gene_start_xpos = gene_variation.get_gene_bounds()[0]
    chrom = genomeloc.get_chr_pos(gene_start_xpos)[0]
    if chrom == 'chrX':
        groups.append(get_x_linked_recessive_individuals(gene_variation, indiv_id_list))

    recessive_ids = set()
    for group in groups:
        recessive_ids.update(group)
    return recessive_ids
Exemple #8
0
def get_recessive_individuals(gene_variation, indiv_id_list):
    """
    Union of the individuals showing homozygous recessive or compound het
    inheritance in this gene, plus x-linked recessive individuals when the
    start of the gene maps to 'chrX'. Returned as a set of individual ids.
    """
    id_lists = [
        get_homozygous_recessive_individuals(gene_variation, indiv_id_list),
        get_compound_het_individuals(gene_variation, indiv_id_list),
    ]
    gene_chrom, _start = genomeloc.get_chr_pos(gene_variation.get_gene_bounds()[0])[:2]
    on_x_chromosome = gene_chrom == 'chrX'
    if on_x_chromosome:
        id_lists.append(get_x_linked_recessive_individuals(gene_variation, indiv_id_list))
    return set().union(*id_lists)
Exemple #9
0
def write_map(filename, snp_panel):
    """
    Writes a MAP file to filename, with the SNPs in snp_panel

    One tab-separated line is written per SNP: the chromosome name with its
    first three characters removed, snp['pos'] as the identifier, a constant
    '0' in the genetic distance column, and the position returned by
    genomeloc.get_chr_pos.

    Note that current implementation does not consider genetic distance, may want to fix that.
    """
    # The context manager closes the file even if a SNP record is malformed
    # and raises part-way through the panel.
    with open(filename, 'w') as f:
        for snp in snp_panel:
            chrom, pos = genomeloc.get_chr_pos(snp['pos'])
            fields = [chrom[3:], str(snp['pos']), '0', str(pos)]
            f.write('\t'.join(fields) + '\n')
Exemple #10
0
def write_map(filename, snp_panel):
    """
    Writes a MAP file to filename, with the SNPs in snp_panel

    Every SNP produces one tab-separated row:
    chromosome (name minus its first three characters), snp['pos'], '0',
    and the position component of genomeloc.get_chr_pos(snp['pos']).

    Note that current implementation does not consider genetic distance, may want to fix that.
    """
    f = open(filename, 'w')
    try:
        for snp in snp_panel:
            chrom, pos = genomeloc.get_chr_pos(snp['pos'])
            row = (chrom[3:], str(snp['pos']), '0', str(pos))
            f.write('\t'.join(row) + '\n')
    finally:
        # Release the handle even when a row fails to serialise; previously
        # an exception in the loop leaked the open file.
        f.close()
Exemple #11
0
def write_sites_vcf(f, sites_list):
    """
    Write a sites-only VCF (no genotype columns) to the writable file object f
    Args:
        f: object with a write() method
        sites_list: iterator of (xpos, ref, alt) tuples
    Returns:
        True (always, once every site has been written)
    """
    header_lines = (
        "##fileformat=VCFv4.0\n",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
    )
    for header_line in header_lines:
        f.write(header_line)

    for site in sites_list:
        chrom, pos = genomeloc.get_chr_pos(site[0])
        # chrom[3:] drops the first three characters of the chromosome name;
        # ID, QUAL, FILTER and INFO are left as '.'
        row = (chrom[3:], str(pos), '.', site[1], site[2], '.', '.', '.')
        f.write('\t'.join(row) + '\n')
    return True
Exemple #12
0
def write_sites_vcf(f, sites_list):
    """
    Serialise sites as a VCFv4.0 sites file into f
    Args:
        f: open, writable file-like object
        sites_list: iterator of (xpos, ref, alt) tuples
    Returns:
        True after all sites have been written
    """
    f.write("##fileformat=VCFv4.0\n")
    f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    missing = '.'
    for site in sites_list:
        chrom_and_pos = genomeloc.get_chr_pos(site[0])
        chrom, pos = chrom_and_pos
        columns = [chrom[3:], str(pos), missing, site[1], site[2]]
        columns.extend([missing] * 3)  # QUAL, FILTER, INFO
        f.write('%s\n' % '\t'.join(columns))
    return True
Exemple #13
0
    def handle(self, *args, **options):
        """Load CADD scores for variants that are missing ``annotation.cadd_phred``.

        Behaviour depends on ``options``:

        * ``cadd_file``: bulk-load via ``load_from_cadd_file`` and stop.
        * ``project_id``: look up scores for the un-annotated variants in that
          project's collection.
        * neither: look up scores for the un-annotated variants in the
          annotator store.

        Scores returned by ``fetch`` are written to the annotator store's
        ``variants`` collection.
        """

        annotator_store = mall.get_annotator().get_annotator_datastore()
        if options['cadd_file']:
            print("Loading " + options['cadd_file'])
            load_from_cadd_file(options['cadd_file'])
            # Nothing to iterate in this mode. Without the early return the
            # loop below raised NameError, because variant_collection is only
            # bound in the other branches.
            print("Done")
            return
        elif options['project_id']:
            print("Loading " + options['project_id'])
            project = Project.objects.get(project_id=options['project_id'])
            variant_collection = get_project_datastore(
                project)._get_project_collection(options['project_id']).find(
                    {'annotation.cadd_phred': {
                        '$exists': False
                    }})
        else:
            variant_collection = annotator_store.variants.find(
                {'annotation.cadd_phred': {
                    '$exists': False
                }})

        for r in tqdm.tqdm(variant_collection, unit=' variants'):
            chrom, pos = genomeloc.get_chr_pos(r['xpos'])
            cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
            if cadd_phred is not None:
                # upsert=False: never create documents here; the assertion
                # fails if no existing document matched.
                result = annotator_store.variants.update(
                    {
                        'xpos': r['xpos'],
                        'ref': r['ref'],
                        'alt': r['alt']
                    }, {'$set': {
                        'annotation.cadd_phred': cadd_phred
                    }},
                    upsert=False)
                assert result['updatedExisting']

        print("Done")
Exemple #14
0
def calculate_cohort_gene_search(cohort, search_spec):
    """
    Compute cohort gene search results from the params in search_spec.
    Should be called after cache is checked - this does all the computation.

    Returns a list of dicts, one per reported gene, with keys gene_info,
    gene_id, gene_name, num_hits, num_unique_variants, chr, start, end and
    control_comparison. Genes with fewer than two hits, or that the reference
    does not know, are left out. Progress messages are written to stderr.
    """
    xcohort = cohort.xcohort()
    cohort_size = len(xcohort.individuals)
    indiv_id_list = xcohort.indiv_id_list()

    results = []
    genes_with_inheritance = cohort_get_genes_with_inheritance(
        get_datastore(cohort.project.project_id),
        get_reference(),
        xcohort,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.genotype_quality_filter,
    )
    for gene_id, indivs_with_inheritance, gene_variation in genes_with_inheritance:
        num_hits = len(indivs_with_inheritance)
        if num_hits < 2:
            # a single hit is not reported
            continue

        try:
            bounds_start, bounds_end = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(bounds_start)
            end = genomeloc.get_chr_pos(bounds_end)[1]
        except KeyError:
            chrom = start = end = None

        # prefer the project's own control cohort, else the site-wide default
        control_cohort = cohort.project.default_control_cohort
        if not control_cohort:
            control_cohort = settings.DEFAULT_CONTROL_COHORT
        control_comparison = population_controls.control_comparison(
            control_cohort,
            gene_id,
            num_hits,
            cohort_size,
            search_spec.inheritance_mode,
            search_spec.variant_filter,
            search_spec.genotype_quality_filter,
        )

        gene_info = get_reference().get_gene(gene_id)
        if gene_info is None:
            continue

        found_message = "     cohort_gene_search - found gene: %s, gene_id: %s \n" % (
            gene_info['symbol'], gene_id)
        sys.stderr.write(found_message)

        relevant_variants = gene_variation.get_relevant_variants_for_indiv_ids(indiv_id_list)
        results.append({
            'gene_info': gene_info,
            'gene_id': gene_id,
            'gene_name': gene_info['symbol'],
            'num_hits': num_hits,
            'num_unique_variants': len(relevant_variants),
            'chr': chrom,
            'start': start,
            'end': end,
            'control_comparison': control_comparison,
        })

    sys.stderr.write(
        "     cohort_gene_search - finished. (cohort_genes_with_inheritance iterator)")
    return results
Exemple #15
0
def generate_rows(project, errors):
    rows = []

    loaded_datasets = list(
        Dataset.objects.filter(project=project,
                               analysis_type="VARIANTS",
                               is_loaded=True))
    if not loaded_datasets:
        errors.append("No data loaded for project: %s" % project)
        logger.info("No data loaded for project: %s" % project)
        return []

    for d in loaded_datasets:
        print("Loaded time %s: %s" % (d, d.loaded_date))

    #project_variant_tag_filter = Q(family__project=project) & (
    #            Q(variant_tag_type__name__icontains="tier 1") |
    #            Q(variant_tag_type__name__icontains="tier 2") |
    #            Q(variant_tag_type__name__icontains="known gene for phenotype"))

    #project_variant_tags = list(VariantTag.objects.select_related('variant_tag_type').filter(project_variant_tag_filter))
    #project_variant_tag_names = [vt.variant_tag_type.name.lower() for vt in project_variant_tags]
    #project_has_tier1 = any([vt_name.startswith("tier 1") for vt_name in project_variant_tag_names])
    #project_has_tier2 = any([vt_name.startswith("tier 2") for vt_name in project_variant_tag_names])
    #project_has_known_gene_for_phenotype = any([(vt_name == "known gene for phenotype") for vt_name in project_variant_tag_names])

    #"External" = REAN
    #"RNA" = RNA
    #"WGS" or "Genome" . = WGS
    #else  "WES"
    lower_case_project_id = project.deprecated_project_id.lower()
    if "external" in lower_case_project_id or "reprocessed" in lower_case_project_id:
        sequencing_approach = "REAN"
    elif "rna" in lower_case_project_id:
        sequencing_approach = "RNA"
    elif "wgs" in lower_case_project_id or "genome" in lower_case_project_id:
        sequencing_approach = "WGS"
    else:
        sequencing_approach = "WES"

    now = timezone.now()
    for family in Family.objects.filter(project=project):
        individuals = list(Individual.objects.filter(family=family))
        samples = list(Sample.objects.filter(individual__family=family))

        phenotips_individual_data_records = [
            json.loads(i.phenotips_data) for i in individuals
            if i.phenotips_data
        ]

        phenotips_individual_features = [
            phenotips_data.get("features", [])
            for phenotips_data in phenotips_individual_data_records
        ]
        phenotips_individual_mim_disorders = [
            phenotips_data.get("disorders", [])
            for phenotips_data in phenotips_individual_data_records
        ]
        phenotips_individual_expected_inheritance_model = [
            inheritance_mode["label"]
            for phenotips_data in phenotips_individual_data_records
            for inheritance_mode in phenotips_data.get(
                "global_mode_of_inheritance", [])
        ]

        omim_ids = [
            disorder.get("id")
            for disorders in phenotips_individual_mim_disorders
            for disorder in disorders if "id" in disorder
        ]
        omim_number_initial = omim_ids[0].replace("MIM:",
                                                  "") if omim_ids else ""

        if omim_number_initial:
            if omim_number_initial in PHENOTYPIC_SERIES_CACHE:
                omim_number_initial = PHENOTYPIC_SERIES_CACHE[
                    omim_number_initial]
            else:
                try:
                    response = requests.get(
                        'https://www.omim.org/entry/' + omim_number_initial,
                        headers={
                            'Host': 'www.omim.org',
                            'Connection': 'keep-alive',
                            'User-Agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
                            'Upgrade-Insecure-Requests': '1',
                            'Accept':
                            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
                        })

                    if not response.ok:
                        raise ValueError("omim request failed: %s %s" %
                                         (response, response.reason))
                    omim_page_html = response.content

                    # <a href="/phenotypicSeries/PS613280" class="btn btn-info" role="button"> Phenotypic Series </a>
                    match = re.search("/phenotypicSeries/([a-zA-Z0-9]+)",
                                      omim_page_html)
                    if not match:
                        logger.info(
                            "No phenotypic series found for OMIM initial # %s"
                            % omim_number_initial)
                        PHENOTYPIC_SERIES_CACHE[
                            omim_number_initial] = omim_number_initial
                    else:
                        phenotypic_series_id = match.group(1)
                        logger.info(
                            "Will replace OMIM initial # %s with phenotypic series %s"
                            % (omim_number_initial, phenotypic_series_id))
                        PHENOTYPIC_SERIES_CACHE[
                            omim_number_initial] = phenotypic_series_id
                        omim_number_initial = PHENOTYPIC_SERIES_CACHE[
                            omim_number_initial]
                except Exception as e:
                    # don't change omim_number_initial
                    logger.info(
                        "Unable to look up phenotypic series for OMIM initial number: %s. %s"
                        % (omim_number_initial, e))

        submitted_to_mme = any([
            individual.mme_submitted_data for individual in individuals
            if individual.mme_submitted_data
        ])

        #samples
        #print([s for s in samples])
        #print([(dataset, dataset.is_loaded, dataset.loaded_date) for sample in samples for dataset in sample.dataset_set.all()])

        datesets_loaded_date_for_family = [
            dataset.loaded_date for sample in samples
            for dataset in sample.dataset_set.filter(analysis_type="VARIANTS")
            if dataset.loaded_date is not None
        ]
        if not datesets_loaded_date_for_family:
            errors.append("No data loaded for family: %s. Skipping..." %
                          family)
            continue

        t0 = min(datesets_loaded_date_for_family)

        t0_diff = rdelta.relativedelta(now, t0)
        t0_months_since_t0 = t0_diff.years * 12 + t0_diff.months

        analysis_complete_status = "first_pass_in_progress"
        if t0_months_since_t0 >= 12:  # or (project_has_tier1 or project_has_tier2 or project_has_known_gene_for_phenotype):
            analysis_complete_status = "complete"

        row = {
            "extras_pedigree_url":
            family.pedigree_image.url if family.pedigree_image else "",
            "project_id":
            project.deprecated_project_id,
            "project_name":
            project.name,
            "t0":
            t0,
            "months_since_t0":
            t0_months_since_t0,
            "family_id":
            family.family_id,
            "coded_phenotype":
            family.coded_phenotype or
            "",  # "Coded Phenotype" field - Ben will add a field that only staff can edit.  Will be on the family page, above short description.
            "sequencing_approach":
            sequencing_approach,  # WES, WGS, RNA, REAN, GENO - Ben will do this using a script based off project name - may need to backfill some
            "sample_source":
            "CMG",  # CMG, NHLBI-X01, NHLBI-nonX01, NEI - Most are CMG so default to them all being CMG.
            "n_kindreds":
            "1",
            "actual_inheritance_model":
            "",
            "expected_inheritance_model":
            "".join(set(phenotips_individual_expected_inheritance_model)) if
            len(set(phenotips_individual_expected_inheritance_model)) == 1 else
            "multiple",  # example: 20161205_044436_852786_MAN_0851_05_1 -  AR-homozygote, AR, AD, de novo, X-linked, UPD, other, multiple  - phenotips - Global mode of inheritance:
            "omim_number_initial":
            omim_number_initial or "NA",
            "omim_number_post_discovery":
            family.post_discovery_omim_number or "NA",
            "collaborator":
            project.name,  # TODO use email addresses?
            "analysis_summary":
            family.analysis_summary.strip('" \n'),
            "phenotype_class":
            "Known" if omim_number_initial else
            "New",  # "disorders"  UE, NEW, MULTI, EXPAN, KNOWN - If there is a MIM number enter "Known" - otherwise put "New"  and then we will need to edit manually for the other possible values
            "solved":
            "N",  # TIER 1 GENE (or known gene for phenotype also record as TIER 1 GENE), TIER 2 GENE, N - Pull from seqr using tags
            "submitted_to_mme":
            "Y" if submitted_to_mme else "NS",
            "pubmed_ids":
            "",
            "posted_publicly":
            "NS",
            "gene_name":
            "NS",
            "gene_count":
            "NA",
            "novel_mendelian_gene":
            "NS",
            "analysis_complete_status":
            analysis_complete_status,  # If known gene for phenotype, tier 1 or tier 2 tag is used on any variant  in project, or 1 year past t0 = complete.  If less than a year and none of the tags above = first pass in progress
            "genome_wide_linkage":
            "NS",
            "p_value":
            "NS",
            "n_kindreds_overlapping_sv_similar_phenotype":
            "NS",
            "n_unrelated_kindreds_with_causal_variants_in_gene":
            "NS",
            "biochemical_function":
            "NS",
            "protein_interaction":
            "NS",
            "expression":
            "NS",
            "patient_cells":
            "NS",
            "non_patient_cell_model":
            "NS",
            "animal_model":
            "NS",
            "non_human_cell_culture_model":
            "NS",
            "rescue":
            "NS",
        }

        #for hpo_category_id, hpo_category_name in HPO_CATEGORY_NAMES.items():
        #    row[hpo_category_name.lower().replace(" ", "_").replace("/", "_")] = "N"

        for hpo_category_name in [
                "connective_tissue",
                "voice",
                "nervous_system",
                "breast",
                "eye_defects",
                "prenatal_development_or_birth",
                "neoplasm",
                "endocrine_system",
                "head_or_neck",
                "immune_system",
                "growth",
                "limbs",
                "thoracic_cavity",
                "blood",
                "musculature",
                "cardiovascular_system",
                "abdomen",
                "skeletal_system",
                "respiratory",
                "ear_defects",
                "metabolism_homeostasis",
                "genitourinary_system",
                "integument",
        ]:
            row[hpo_category_name] = "N"

        category_not_set_on_some_features = False
        for features_list in phenotips_individual_features:
            for feature in features_list:
                if "category" not in feature:
                    category_not_set_on_some_features = True
                    continue

                if feature["observed"].lower() == "yes":
                    hpo_category_id = feature["category"]
                    hpo_category_name = HPO_CATEGORY_NAMES[hpo_category_id]
                    key = hpo_category_name.lower().replace(" ", "_").replace(
                        "/", "_")

                    row[key] = "Y"
                elif feature["observed"].lower() == "no":
                    continue
                else:
                    raise ValueError("Unexpected value for 'observed' in %s" %
                                     (feature, ))

        if category_not_set_on_some_features:
            errors.append(
                "HPO category field not set for some HPO terms in %s" % family)

        variant_tag_filter = Q(family=family) & (
            Q(variant_tag_type__name__icontains="tier 1")
            | Q(variant_tag_type__name__icontains="tier 2")
            | Q(variant_tag_type__name__icontains="known gene for phenotype"))

        variant_tags = list(
            VariantTag.objects.select_related('variant_tag_type').filter(
                variant_tag_filter))
        if not variant_tags:
            rows.append(row)
            continue

        gene_ids_to_variant_tags = defaultdict(list)
        for vt in variant_tags:

            if not vt.saved_variant_json:
                errors.append("%s - variant annotation not found" % vt)
                rows.append(row)
                continue

            vt.saved_variant_json = json.loads(vt.saved_variant_json)

            if "coding_gene_ids" not in vt.saved_variant_json[
                    "annotation"] and "gene_ids" not in vt.saved_variant_json[
                        "annotation"]:
                errors.append("%s - no gene_ids" % vt)
                rows.append(row)
                continue

            gene_ids = vt.saved_variant_json["annotation"].get(
                "coding_gene_ids", [])
            if not gene_ids:
                gene_ids = vt.saved_variant_json["annotation"].get(
                    "gene_ids", [])

            if not gene_ids:
                errors.append("%s - gene_ids not specified" % vt)
                rows.append(row)
                continue

            # get the shortest gene_id
            gene_id = list(sorted(gene_ids,
                                  key=lambda gene_id: len(gene_id)))[0]

            gene_ids_to_variant_tags[gene_id].append(vt)

        for gene_id, variant_tags in gene_ids_to_variant_tags.items():
            gene_symbol = get_reference().get_gene_symbol(gene_id)

            lower_case_variant_tag_type_names = [
                vt.variant_tag_type.name.lower() for vt in variant_tags
            ]
            has_tier1 = any(
                name.startswith("tier 1")
                for name in lower_case_variant_tag_type_names)
            has_tier2 = any(
                name.startswith("tier 2")
                for name in lower_case_variant_tag_type_names)
            has_known_gene_for_phenotype = any(
                name == "known gene for phenotype"
                for name in lower_case_variant_tag_type_names)

            has_tier1_phenotype_expansion_or_novel_mode_of_inheritance = any(
                name.startswith("tier 1") and
                ('expansion' in name.lower() or 'novel mode' in name.lower())
                for name in lower_case_variant_tag_type_names)
            has_tier_1_or_2_phenotype_not_delineated = any(
                (name.startswith("tier 1") or name.startswith("tier 2")) and (
                    'not delineated' in name.lower())
                for name in lower_case_variant_tag_type_names)

            analysis_complete_status = row["analysis_complete_status"]
            if has_tier1 or has_tier2 or has_known_gene_for_phenotype:
                analysis_complete_status = "complete"

            variant_tag_list = [
                ("%s  %s  %s" % ("-".join(
                    map(
                        str,
                        list(genomeloc.get_chr_pos(vt.xpos_start)) +
                        [vt.ref, vt.alt])), gene_symbol,
                                 vt.variant_tag_type.name.lower()))
                for vt in variant_tags
            ]

            actual_inheritance_models = set()
            potential_compound_hets = defaultdict(
                int)  # gene_id to compound_hets counter
            for vt in variant_tags:
                affected_indivs_with_hom_alt_variants = set()
                affected_indivs_with_het_variants = set()
                affected_total_individuals = 0
                unaffected_indivs_with_hom_alt_variants = set()
                unaffected_indivs_with_het_variants = set()
                unaffected_total_individuals = 0
                is_x_linked = False
                if vt.saved_variant_json["genotypes"]:
                    chrom, pos = genomeloc.get_chr_pos(vt.xpos_start)
                    is_x_linked = "X" in chrom
                    for indiv_id, genotype in json.loads(
                            vt.saved_variant_json["genotypes"]).items():
                        try:
                            i = Individual.objects.get(family=family,
                                                       individual_id=indiv_id)
                        except ObjectDoesNotExist as e:
                            logger.warn(
                                "WARNING: Couldn't find individual: %s, %s" %
                                (family, indiv_id))
                            continue

                        if i.affected == "A":
                            affected_total_individuals += 1
                        elif i.affected == "N":
                            unaffected_total_individuals += 1

                        if genotype["num_alt"] == 2 and i.affected == "A":
                            affected_indivs_with_hom_alt_variants.add(indiv_id)
                        elif genotype["num_alt"] == 1 and i.affected == "A":
                            affected_indivs_with_het_variants.add(indiv_id)
                        elif genotype["num_alt"] == 2 and i.affected == "N":
                            unaffected_indivs_with_hom_alt_variants.add(
                                indiv_id)
                        elif genotype["num_alt"] == 1 and i.affected == "N":
                            unaffected_indivs_with_het_variants.add(indiv_id)

                # AR-homozygote, AR-comphet, AR, AD, de novo, X-linked, UPD, other, multiple
                if not unaffected_indivs_with_hom_alt_variants and affected_indivs_with_hom_alt_variants:
                    if "AR-comphet" not in actual_inheritance_models:
                        if is_x_linked:
                            actual_inheritance_models.add("X-linked")
                        else:
                            actual_inheritance_models.add("AR-homozygote")

                if not unaffected_indivs_with_hom_alt_variants and not unaffected_indivs_with_het_variants and affected_indivs_with_het_variants:
                    if "AR-comphet" not in actual_inheritance_models:
                        if unaffected_total_individuals > 0:
                            actual_inheritance_models.add("de novo")
                        else:
                            actual_inheritance_models.add("AD")

                if not unaffected_indivs_with_hom_alt_variants and (
                        unaffected_total_individuals < 2
                        or unaffected_indivs_with_het_variants
                ) and affected_indivs_with_het_variants and not affected_indivs_with_hom_alt_variants:
                    potential_compound_hets[gene_id] += 1
                    print("%s incremented compound het for %s to %s" %
                          (vt, gene_id, potential_compound_hets[gene_id]))
                    if potential_compound_hets[gene_id] >= 2:
                        actual_inheritance_models.clear()
                        actual_inheritance_models.add("AR-comphet")

            actual_inheritance_model = " (%d aff hom, %d aff het, %d unaff hom, %d unaff het) " % (
                #affected_total_individuals,
                #unaffected_total_individuals,
                len(affected_indivs_with_hom_alt_variants),
                len(affected_indivs_with_het_variants),
                len(unaffected_indivs_with_hom_alt_variants),
                len(unaffected_indivs_with_het_variants),
            )

            actual_inheritance_model = ", ".join(
                actual_inheritance_models)  #+ actual_inheritance_model
            NA_or_KPG_or_NS = "NA" if has_tier1 or has_tier2 else (
                "KPG" if has_known_gene_for_phenotype else "NS")
            KPG_or_blank_or_NS = "KPG" if has_known_gene_for_phenotype else (
                "" if has_tier1 or has_tier2 else "NS")

            # "disorders"  UE, NEW, MULTI, EXPAN, KNOWN - If there is a MIM number enter "Known" - otherwise put "New"  and then we will need to edit manually for the other possible values
            phenotype_class = "EXPAN" if has_tier1_phenotype_expansion_or_novel_mode_of_inheritance else (
                "UE" if has_tier_1_or_2_phenotype_not_delineated else
                ("Known" if omim_number_initial else "New"))

            # create a copy of the row dict
            row = dict(row)

            row.update({
                "extras_variant_tag_list":
                variant_tag_list,
                "extras_num_variant_tags":
                len(variant_tags),
                "gene_name":
                str(gene_symbol) if gene_symbol and
                (has_tier1 or has_tier2 or has_known_gene_for_phenotype) else
                "NS",
                "gene_count":
                len(gene_ids_to_variant_tags.keys())
                if len(gene_ids_to_variant_tags.keys()) > 1 else "NA",
                "novel_mendelian_gene":
                "Y" if any("novel gene" in name
                           for name in lower_case_variant_tag_type_names) else
                ("N" if has_tier1 or has_tier2 or has_known_gene_for_phenotype
                 else "NS"),
                "solved": ("TIER 1 GENE" if
                           (has_tier1 or has_known_gene_for_phenotype) else
                           ("TIER 2 GENE" if has_tier2 else "N")),
                "posted_publicly": ("" if has_tier1 or has_tier2
                                    or has_known_gene_for_phenotype else "NS"),
                "submitted_to_mme":
                "Y" if submitted_to_mme else
                ("TBD" if has_tier1 or has_tier2 else
                 ("KPG" if has_known_gene_for_phenotype else "NS")),
                "actual_inheritance_model":
                actual_inheritance_model,
                "analysis_complete_status":
                analysis_complete_status,  # If known gene for phenotype, tier 1 or tier 2 tag is used on any variant  in project, or 1 year past t0 = complete.  If less than a year and none of the tags above = first pass in progress
                "genome_wide_linkage":
                NA_or_KPG_or_NS,
                "p_value":
                NA_or_KPG_or_NS,
                "n_kindreds_overlapping_sv_similar_phenotype":
                NA_or_KPG_or_NS,
                "n_unrelated_kindreds_with_causal_variants_in_gene":
                "1" if has_tier1 or has_tier2 else
                ("KPG" if has_known_gene_for_phenotype else "NS"),
                "biochemical_function":
                KPG_or_blank_or_NS,
                "protein_interaction":
                KPG_or_blank_or_NS,
                "expression":
                KPG_or_blank_or_NS,
                "patient_cells":
                KPG_or_blank_or_NS,
                "non_patient_cell_model":
                KPG_or_blank_or_NS,
                "animal_model":
                KPG_or_blank_or_NS,
                "non_human_cell_culture_model":
                KPG_or_blank_or_NS,
                "rescue":
                KPG_or_blank_or_NS,
                "phenotype_class":
                phenotype_class,
            })

            rows.append(row)

    return rows
Exemple #16
0
def calculate_cohort_gene_search(cohort, search_spec):
    """
    Calculate cohort gene search results from the params in search_spec.

    Should be called after the cache is checked - this does all the computation.

    Every gene yielded by cohort_get_genes_with_inheritance is considered.
    Genes with fewer than 2 hits (len(indivs_with_inheritance)) and genes the
    reference returns no record for are dropped; each remaining gene is
    compared against the control cohort (the project's default control cohort,
    falling back to settings.DEFAULT_CONTROL_COHORT).

    Args:
        cohort: cohort to search; cohort.xcohort() and cohort.project are read.
        search_spec: supplies inheritance_mode, variant_filter and quality_filter.

    Returns:
        A list with one dict per reported gene, with the keys gene_info,
        gene_id, gene_name, num_hits, num_unique_variants, chr, start, end and
        control_comparison. chr/start/end are None when the gene bounds lookup
        raises KeyError.
    """
    xcohort = cohort.xcohort()
    cohort_size = len(xcohort.individuals)
    indiv_id_list = xcohort.indiv_id_list()

    # Loop-invariant lookups, resolved once instead of once per gene.
    reference = get_reference()
    project = cohort.project
    control_cohort = project.default_control_cohort or settings.DEFAULT_CONTROL_COHORT

    genes = []
    for gene_id, indivs_with_inheritance, gene_variation in cohort_get_genes_with_inheritance(
            get_datastore(project.project_id),
            reference,
            xcohort,
            search_spec.inheritance_mode,
            search_spec.variant_filter,
            search_spec.quality_filter,
    ):
        num_hits = len(indivs_with_inheritance)

        # don't return genes with fewer than 2 hits
        if num_hits < 2:
            continue

        # Check that the gene is known before doing any further work for it:
        # genes without a reference record are never reported.
        xgene = reference.get_gene(gene_id)
        if xgene is None:
            continue

        try:
            start_pos, end_pos = reference.get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(start_pos)
            end = genomeloc.get_chr_pos(end_pos)[1]
        except KeyError:
            chrom, start, end = None, None, None

        control_comparison = population_controls.control_comparison(
            control_cohort,
            gene_id,
            num_hits,
            cohort_size,
            search_spec.inheritance_mode,
            search_spec.variant_filter,
            search_spec.quality_filter,
        )

        sys.stderr.write("     cohort_gene_search - found gene: %s, gene_id: %s \n" % (xgene['symbol'], gene_id, ))
        genes.append({
            'gene_info': xgene,
            'gene_id': gene_id,
            'gene_name': xgene['symbol'],
            'num_hits': num_hits,
            'num_unique_variants': len(gene_variation.get_relevant_variants_for_indiv_ids(indiv_id_list)),
            'chr': chrom,
            'start': start,
            'end': end,
            'control_comparison': control_comparison,
        })

    sys.stderr.write("     cohort_gene_search - finished. (cohort_genes_with_inheritance iterator)")
    return genes
Exemple #17
0
    def get_output_row(self,
                       variant,
                       xpos,
                       ref,
                       alt,
                       individual_id,
                       family,
                       all_fields=False,
                       comments="",
                       gene_id=""):
        """Build one report row (a list of strings) for a variant in one individual.

        Args:
            variant: variant object; its genotypes, annotation and toJSON() are read.
            xpos: position value that genomeloc.get_chr_pos splits into (chrom, pos).
            ref: reference allele string.
            alt: alternate allele string.
            individual_id: key into variant.genotypes.
            family: only family.family_id is read (for the "skipping" message).
            all_fields: when True and a ClinVar significance was found, the
                ClinVar web page is downloaded to look up the star rating.
            comments: free text placed in the last column.
            gene_id: optional gene id; everything from the first "." on is dropped.

        Returns:
            None when the individual has no genotype entry, when the genotype's
            gq is None, or when no gene_id was given and none of the
            annotations are protein coding. Otherwise a list of 15 strings:
            gene name, genotype, variant, functional class, hgvs_c, hgvs_p,
            rsid, ExAC global AF, ExAC popmax AF, ExAC popmax population,
            ClinVar significance, ClinVar review status, number of stars,
            ClinVar url, comments.

        Raises:
            ValueError: if the individual's genotype has num_alt set to None.
        """
        v = variant
        if individual_id not in v.genotypes:
            print("skipping variant: %s because individual %s not in %s" %
                  (str(xpos) + " " + ref + ">" + alt, individual_id,
                   family.family_id))
            return None

        # strip off the gene_id suffix (eg. '.3')
        gene_id = gene_id.split(".")[0] if gene_id else None

        genotype = v.genotypes[individual_id]
        if genotype.gq is None:
            print(
                "skipping variant: %s because this variant is not called in this individual (%s)"
                % (str(xpos) + " " + ref + ">" + alt, individual_id))
            return None

        chrom, pos = genomeloc.get_chr_pos(xpos)
        chrom_without_chr = chrom.replace("chr", "")

        annot = v.annotation
        if gene_id:
            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
                annot["vep_annotation"], gene_id=gene_id)
        else:
            # collect the ids of all genes that have a protein coding annotation
            protein_coding_gene_ids = set(
                a['gene'] for a in annot["vep_annotation"]
                if a['biotype'] == 'protein_coding')
            if not protein_coding_gene_ids:
                print(
                    "skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s"
                    %
                    (str(xpos) + " " + ref + ">" + alt, individual_id, annot))
                return None

            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
                annot["vep_annotation"], gene_id=protein_coding_gene_ids)
            if len(protein_coding_gene_ids) > 1:
                print("Selected %s from %s" %
                      (annot["vep_annotation"][worst_vep_annotation_index]
                       ['symbol'],
                       {
                           a['symbol'] for a in annot["vep_annotation"]
                           if a['gene'] in protein_coding_gene_ids
                       }))

        # Keys seen on a vep annotation entry: ea_maf, swissprot,
        # existing_variation, pubmed, aa_maf, ccds, high_inf_pos, cdna_position,
        # canonical, tsl, feature_type, intron, trembl, feature, codons,
        # polyphen, clin_sig, motif_pos, protein_position, afr_maf, amino_acids,
        # cds_position, symbol, uniparc, eur_maf, hgnc_id, consequence, sift,
        # exon, biotype, is_nc, gmaf, motif_name, strand, motif_score_change,
        # distance, hgvsp, ensp, allele, symbol_source, amr_maf, somatic, hgvsc,
        # asn_maf, is_nmd, domains, gene
        #
        # NOTE(review): the gene-restricted index chosen above is only used for
        # this first lookup; it is then replaced by the overall worst
        # annotation, so the gene_id / protein-coding selection does not affect
        # the reported row. Behavior is kept as-is - confirm this is intended.
        vep = annot["vep_annotation"][worst_vep_annotation_index]

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(
            annot["vep_annotation"])

        vep = annot["vep_annotation"][worst_vep_annotation_index]

        if "symbol" in vep and "consequence" in vep:
            gene_name = vep["symbol"]
            functional_class = vep["consequence"]
        else:
            gene_name = functional_class = ""
            print(
                "ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s"
                % {"vep": vep})

        if genotype.num_alt is None:
            # include every individual's genotype in the error to ease debugging
            genotype_dump = "\n\n" + "".join(
                str(indiv) + ": " + str(g) + "\n"
                for indiv, g in v.genotypes.items())
            raise ValueError("genotype.num_alt is None: " + str(genotype) +
                             "\n" + str(v.toJSON()) + "\n" + genotype_dump)

        genotype_str = genotype_map[genotype.num_alt]

        variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)
        if "hgvsc" in vep and "hgvsp" in vep:
            hgvs_c = urllib.unquote(vep["hgvsc"])
            hgvs_p = urllib.unquote(vep["hgvsp"])
        else:
            hgvs_c = hgvs_p = ""

        rsid = annot["rsid"] or ""

        exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(
            chrom, pos, ref, alt)
        if exac_global_af is None:
            exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
        else:
            exac_global_af_annot = str(annot["freqs"]["exac_v3"])
            if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
                # first value is the stored annotation, second the one just fetched
                print(
                    "Error annot['freqs']['exac_v3']  (%s) doesn't match %s" %
                    (float(exac_global_af_annot), float(exac_global_af)))

        clinvar_clinsig = ""
        clinvar_clnrevstat = ""

        clinvar_records = [
            record
            for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos)
            if record.POS == pos and record.REF == ref
        ]

        if clinvar_records:
            # when several records match, the last one is used
            clinvar_record = clinvar_records[-1]
            # real lists (not map objects) because they are indexed / re-scanned below
            clinvar_allele_indexes = [
                int(allele_index)
                for allele_index in clinvar_record.INFO["CLNALLE"]
            ]
            clinvar_alleles = [
                str(allele)
                for allele in [clinvar_record.REF] + clinvar_record.ALT
            ]
            xbrowse_alleles = [str(ref), str(alt)]
            # positions in the per-allele INFO lists that refer to our ref or alt
            clinvar_value_indexes_to_use = [
                value_index for value_index, clinvar_allele_index in enumerate(
                    clinvar_allele_indexes)
                if clinvar_alleles[clinvar_allele_index].upper() in
                xbrowse_alleles
            ]
            clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
            clnrevstat = [
                clnrevstat[value_index]
                for value_index in clinvar_value_indexes_to_use
            ]
            clnsig = clinvar_record.INFO["CLNSIG"]
            clnsig = [
                clnsig[value_index]
                for value_index in clinvar_value_indexes_to_use
            ]
            if clnsig:
                clinvar_clinsig = "|".join({
                    clinsig_map[int(clinsig_number)][0]
                    for clinsig_number in clnsig[0].split("|")
                })

                clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

        # the star rating is only scraped from the ClinVar web page on request
        number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
        clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term=" + chrom_without_chr + "[chr]+AND+" + str(
            pos) + "[chrpos37]"
        if clinvar_clinsig and all_fields:
            print("Reading from: " + clinvar_url)
            url_opener = urllib2.build_opener()
            url_opener.addheaders = [(
                'User-agent',
                "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"
            )]
            response = url_opener.open(clinvar_url)
            try:
                page_contents = response.read()
            finally:
                # always release the connection, even if the read fails
                response.close()
            match = re.search(r"(\d) star.? out of maximum of 4 stars",
                              page_contents)
            if match:
                number_of_stars = int(match.group(1))
            else:
                print("No match in page: " + clinvar_url)
                for line in page_contents.split("\n"):
                    if "rev_stat_text hide" in line:
                        print(
                            " -- this line was expected to contain number of stars: "
                            + line)

        return [
            str(column) for column in [
                gene_name, genotype_str, variant_str, functional_class, hgvs_c,
                hgvs_p, rsid, exac_global_af, exac_popmax_af,
                exac_popmax_population, clinvar_clinsig, clinvar_clnrevstat,
                number_of_stars, clinvar_url, comments
            ]
        ]
Exemple #18
0
    def add_preannotated_vcf_file(self,
                                  vcf_file_path,
                                  force=False,
                                  start_from_chrom=None,
                                  end_with_chrom=None):
        """
        Add the variants in a VEP-annotated VCF to the db.variants collection.

        Each allele parsed from the file is upserted into self._db.variants
        (keyed by xpos/ref/alt) with its VEP annotation, population
        frequencies and predictor values. On success the path is recorded in
        self._db.vcf_files.

        Args:
            vcf_file_path: path of the VCF. Its header must declare a CSQ INFO
                field that contains every expected VEP field.
            force: load the file even if vcf_file_path is already recorded in
                self._db.vcf_files.
            start_from_chrom: optional first chromosome to load (with or
                without a "chr" prefix), in the order 1-22, X, Y.
            end_with_chrom: optional last chromosome to load (inclusive).
                When either bound is given, the file is read per chromosome
                through pysam.TabixFile instead of sequentially.

        Raises:
            ValueError: if the CSQ field is absent or lacks expected fields, or
                if a chromosome bound is not one of 1-22, X, Y.
        """
        import pprint

        if not force and self._db.vcf_files.find_one(
                {'vcf_file_path': vcf_file_path}):
            print("VCF %(vcf_file_path)s already loaded into db.variants cache"
                  % {'vcf_file_path': vcf_file_path})
            return

        r = vcf.VCFReader(filename=vcf_file_path)
        if "CSQ" not in r.infos:
            raise ValueError(
                "ERROR: CSQ field not found in %s. Was this VCF annotated with VEP?"
                % vcf_file_path)

        expected_csq_fields = set(
            "Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|BIOTYPE|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|EXON|INTRON|DOMAINS|HGVSc|HGVSp|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|Polyphen2_HVAR_pred|CADD_phred|MutationTaster_pred|MetaSVM_pred|SIFT_pred|FATHMM_pred"
            .split("|"))
        # the CSQ description lists the field names after "Format:"
        actual_csq_fields_string = str(
            r.infos["CSQ"].desc).split("Format:")[1].strip()
        actual_csq_fields = set(actual_csq_fields_string.split("|"))
        missing_csq_fields = expected_csq_fields - actual_csq_fields
        if missing_csq_fields:
            raise ValueError(
                "ERROR: VEP did not add all expected CSQ fields to the VCF. The VCF's CSQ = %s and is missing these fields: %s"
                % (actual_csq_fields_string, missing_csq_fields))

        # everything opened below is closed in the finally clause
        open_handles = []
        try:
            if start_from_chrom or end_with_chrom:
                if start_from_chrom:
                    print("Start chrom: chr%s" % start_from_chrom)
                if end_with_chrom:
                    print("End chrom: chr%s" % end_with_chrom)

                chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
                chrom_list_start_index = 0
                if start_from_chrom:
                    chrom_list_start_index = chrom_list.index(
                        start_from_chrom.replace("chr", "").upper())

                chrom_list_end_index = len(chrom_list)
                if end_with_chrom:
                    chrom_list_end_index = chrom_list.index(
                        end_with_chrom.replace("chr", "").upper())

                tabix_file = pysam.TabixFile(vcf_file_path)
                open_handles.append(tabix_file)
                # header first, then the records of each requested chromosome
                vcf_iter = tabix_file.header
                for chrom in chrom_list[
                        chrom_list_start_index:chrom_list_end_index + 1]:
                    print("Will load chrom: " + chrom)
                    try:
                        vcf_iter = itertools.chain(vcf_iter,
                                                   tabix_file.fetch(chrom))
                    except ValueError as e:
                        # the chromosome is skipped when fetch rejects it
                        print("WARNING: add_preannotated_vcf_file: " + str(e))

                vcf_file_obj = vcf_iter
            else:
                print("Loading pre-annotated VCF file: %s into db.variants cache" %
                      vcf_file_path)
                vcf_file_obj = gzip.open(vcf_file_path) if vcf_file_path.endswith(
                    '.gz') else open(vcf_file_path)
                open_handles.append(vcf_file_obj)

            alleles_counter = 0
            for variant, vep_annotation in vep_annotations.parse_vep_annotations_from_vcf(
                    vcf_file_obj):
                variant_t = variant.unique_tuple()
                alleles_counter += 1
                annotation = {
                    'vep_annotation':
                    vep_annotation,
                    'freqs':
                    self._population_frequency_store.get_frequencies(
                        variant_t[0], variant_t[1], variant_t[2]),
                }

                # presumably sets annotation["worst_vep_annotation_index"],
                # which is read just below - verify against the helper
                add_convenience_annotations(annotation)

                worst_annotation = vep_annotation[
                    annotation["worst_vep_annotation_index"]]
                annotation.update(get_predictors(worst_annotation))

                # progress output every 10000 alleles
                if alleles_counter % 10000 == 0:
                    pprint.pprint(variant_t)

                self._db.variants.update(
                    {
                        'xpos': variant_t[0],
                        'ref': variant_t[1],
                        'alt': variant_t[2]
                    }, {'$set': {
                        'annotation': annotation
                    }},
                    upsert=True)
        finally:
            for handle in open_handles:
                handle.close()

        print("Finished parsing %s alleles from %s" %
              (alleles_counter, vcf_file_path))
        self._db.vcf_files.update({'vcf_file_path': vcf_file_path}, {
            'vcf_file_path': vcf_file_path,
            'date_added': datetime.datetime.utcnow()
        },
                                  upsert=True)
    def get_output_row(self, variant, xpos, ref, alt, individual_id, family, all_fields=False, comments="", gene_id=""):
        """Build one report row (a list of strings) for a variant in one individual.

        Args:
            variant: variant object; its genotypes, annotation and toJSON() are read.
            xpos: position value that genomeloc.get_chr_pos splits into (chrom, pos).
            ref: reference allele string.
            alt: alternate allele string.
            individual_id: key into variant.genotypes.
            family: only family.family_id is read (for the "skipping" message).
            all_fields: when True and a ClinVar significance was found, the
                ClinVar web page is downloaded to look up the star rating.
            comments: free text placed in the last column.
            gene_id: optional gene id; everything from the first "." on is dropped.

        Returns:
            None when the individual has no genotype entry, when the genotype's
            gq is None, or when no gene_id was given and none of the
            annotations are protein coding. Otherwise a list of 15 strings:
            gene name, genotype, variant, functional class, hgvs_c, hgvs_p,
            rsid, ExAC global AF, ExAC popmax AF, ExAC popmax population,
            ClinVar significance, ClinVar review status, number of stars,
            ClinVar url, comments.

        Raises:
            ValueError: if the individual's genotype has num_alt set to None.
        """
        v = variant
        if individual_id not in v.genotypes:
            print("skipping variant: %s because individual %s not in %s" % (str(xpos) + " " + ref + ">" + alt, individual_id, family.family_id))
            return None

        # strip off the gene_id suffix (eg. '.3')
        gene_id = gene_id.split(".")[0] if gene_id else None

        genotype = v.genotypes[individual_id]
        if genotype.gq is None:
            print("skipping variant: %s because this variant is not called in this individual (%s)"  % (str(xpos)+" " + ref + ">" + alt, individual_id))
            return None

        chrom, pos = genomeloc.get_chr_pos(xpos)
        chrom_without_chr = chrom.replace("chr", "")

        annot = v.annotation
        if gene_id:
            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"], gene_id=gene_id)
        else:
            # collect the ids of all genes that have a protein coding annotation
            protein_coding_gene_ids = set(a['gene'] for a in annot["vep_annotation"] if a['biotype'] == 'protein_coding')
            if not protein_coding_gene_ids:
                print("skipping variant %s in this individual (%s) because none of the transcripts are protein coding: %s"  % (str(xpos)+" " + ref + ">" + alt, individual_id, annot))
                return None

            worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"], gene_id=protein_coding_gene_ids)
            if len(protein_coding_gene_ids) > 1:
                candidate_symbols = {a['symbol'] for a in annot["vep_annotation"] if a['gene'] in protein_coding_gene_ids}
                print("Selected %s from %s" % (annot["vep_annotation"][worst_vep_annotation_index]['symbol'], candidate_symbols))

        # Keys seen on a vep annotation entry: ea_maf, swissprot,
        # existing_variation, pubmed, aa_maf, ccds, high_inf_pos, cdna_position,
        # canonical, tsl, feature_type, intron, trembl, feature, codons,
        # polyphen, clin_sig, motif_pos, protein_position, afr_maf, amino_acids,
        # cds_position, symbol, uniparc, eur_maf, hgnc_id, consequence, sift,
        # exon, biotype, is_nc, gmaf, motif_name, strand, motif_score_change,
        # distance, hgvsp, ensp, allele, symbol_source, amr_maf, somatic, hgvsc,
        # asn_maf, is_nmd, domains, gene
        #
        # NOTE(review): the gene-restricted index chosen above is only used for
        # this first lookup; it is then replaced by the overall worst
        # annotation, so the gene_id / protein-coding selection does not affect
        # the reported row. Behavior is kept as-is - confirm this is intended.
        vep = annot["vep_annotation"][worst_vep_annotation_index]

        worst_vep_annotation_index = vep_annotations.get_worst_vep_annotation_index(annot["vep_annotation"])

        vep = annot["vep_annotation"][worst_vep_annotation_index]

        if "symbol" in vep and "consequence" in vep:
            gene_name = vep["symbol"]
            functional_class = vep["consequence"]
        else:
            gene_name = functional_class = ""
            print("ERROR: gene_name and functional_class not found in annot['vep_annotation'][annot['worst_vep_annotation_index']]: %(vep)s" % {"vep": vep})

        if genotype.num_alt is None:
            # include every individual's genotype in the error to ease debugging
            genotype_dump = "\n\n" + "".join(str(indiv) + ": " + str(g) + "\n" for indiv, g in v.genotypes.items())
            raise ValueError("genotype.num_alt is None: " + str(genotype) + "\n" + str(v.toJSON()) + "\n" + genotype_dump)

        genotype_str = genotype_map[genotype.num_alt]

        variant_str = "%s:%s %s>%s" % (chrom, pos, ref, alt)
        if "hgvsc" in vep and "hgvsp" in vep:
            hgvs_c = urllib.unquote(vep["hgvsc"])
            hgvs_p = urllib.unquote(vep["hgvsp"])
        else:
            hgvs_c = hgvs_p = ""

        rsid = annot["rsid"] or ""

        exac_global_af, exac_popmax_af, exac_popmax_population = get_exac_af(chrom, pos, ref, alt)
        if exac_global_af is None:
            exac_global_af, exac_popmax_af, exac_popmax_population = 0, 0, "[variant not found in ExACv0.3]"
        else:
            exac_global_af_annot = str(annot["freqs"]["exac_v3"])
            if abs(float(exac_global_af) - float(exac_global_af_annot)) > 0.01:
                # first value is the stored annotation, second the one just fetched
                print("Error annot['freqs']['exac_v3']  (%s) doesn't match %s" % (float(exac_global_af_annot), float(exac_global_af)))

        clinvar_clinsig = ""
        clinvar_clnrevstat = ""

        clinvar_records = [record for record in clinvar_vcf_file.fetch(chrom_without_chr, pos, pos) if record.POS == pos and record.REF == ref]

        if clinvar_records:
            # when several records match, the last one is used
            clinvar_record = clinvar_records[-1]
            # real lists (not map objects) because they are indexed / re-scanned below
            clinvar_allele_indexes = [int(allele_index) for allele_index in clinvar_record.INFO["CLNALLE"]]
            clinvar_alleles = [str(allele) for allele in [clinvar_record.REF] + clinvar_record.ALT]
            xbrowse_alleles = [str(ref), str(alt)]
            # positions in the per-allele INFO lists that refer to our ref or alt
            clinvar_value_indexes_to_use = [
                value_index
                for value_index, clinvar_allele_index in enumerate(clinvar_allele_indexes)
                if clinvar_alleles[clinvar_allele_index].upper() in xbrowse_alleles
            ]
            clnrevstat = clinvar_record.INFO["CLNREVSTAT"]
            clnrevstat = [clnrevstat[value_index] for value_index in clinvar_value_indexes_to_use]
            clnsig = clinvar_record.INFO["CLNSIG"]
            clnsig = [clnsig[value_index] for value_index in clinvar_value_indexes_to_use]
            if clnsig:
                clinvar_clinsig = "|".join({clinsig_map[int(clinsig_number)][0] for clinsig_number in clnsig[0].split("|")})

                clinvar_clnrevstat = "|".join(set(clnrevstat[0].split("|")))

        # the star rating is only scraped from the ClinVar web page on request
        number_of_stars = "[not found]" if all_fields else "[not retrieved to save time]"
        clinvar_url = "http://www.ncbi.nlm.nih.gov/clinvar/?term="+chrom_without_chr+"[chr]+AND+"+str(pos)+"[chrpos37]"
        if clinvar_clinsig and all_fields:
            print("Reading from: " + clinvar_url)
            url_opener = urllib2.build_opener()
            url_opener.addheaders = [('User-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")]
            response = url_opener.open(clinvar_url)
            try:
                page_contents = response.read()
            finally:
                # always release the connection, even if the read fails
                response.close()
            match = re.search(r"(\d) star.? out of maximum of 4 stars", page_contents)
            if match:
                number_of_stars = int(match.group(1))
            else:
                print("No match in page: " + clinvar_url)
                for line in page_contents.split("\n"):
                    if "rev_stat_text hide" in line:
                        print(" -- this line was expected to contain number of stars: " + line)

        columns = [gene_name, genotype_str, variant_str, functional_class, hgvs_c, hgvs_p, rsid, exac_global_af, exac_popmax_af, exac_popmax_population, clinvar_clinsig, clinvar_clnrevstat, number_of_stars, clinvar_url, comments]
        return [str(column) for column in columns]
Exemple #20
0
    def handle(self, *args, **options):
        """Spot-check that variants in each project's VCF files can be annotated.

        For every selected project, variants are read from the project's VCF
        files and looked up with
        ``get_mall(project).annotator.get_annotation``. A ``ValueError`` from
        that lookup is counted as "variant not found". Scanning of a project
        stops once ``number_of_variants_to_check`` variants have been examined
        or its VCF files are exhausted, whichever comes first. If more than 10
        variants were not found, one error line is printed for the project,
        followed by up to 30 example variants as ExAC browser URLs.

        Args:
            *args: project ids to check. When empty, every ``Project`` is
                checked, in the reverse of the order returned by
                ``Project.objects.all()``.
            **options: ``number_of_variants_to_check`` caps how many variants
                are examined per project (20000 when missing or falsy).
        """
        number_of_variants_to_check = int(
            options.get("number_of_variants_to_check") or 20000)
        max_examples = 30  # number of missing variants kept for the report

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Project.DoesNotExist:
                # Only a missing project is skipped; other failures (e.g. a
                # database error) propagate instead of being mislabelled.
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            limit_reached = False

            for vcf_file in project.get_all_vcf_files():
                path = vcf_file.file_path
                # Fall back to the compressed file when the plain .vcf is gone.
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    path = path + ".gz"
                open_vcf = gzip.open if path.endswith(".gz") else open

                # The context manager closes the handle even when the scan
                # stops early or an unexpected exception escapes.
                with open_vcf(path) as f:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project).annotator.get_annotation(
                                variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            if len(not_found_variants) < max_examples:
                                chrom, pos = genomeloc.get_chr_pos(
                                    variant.xpos)
                                not_found_variants.append(
                                    "%s-%s-%s-%s" % (
                                        chrom.replace("chr", ""), pos,
                                        variant.ref, variant.alt))
                        if all_counter >= number_of_variants_to_check:
                            limit_reached = True
                            break

                # Leave the file loop too; otherwise each remaining file
                # would read one more variant and trigger a duplicate report.
                if limit_reached:
                    break

            # Report once per project, after scanning, so that projects with
            # fewer variants than the limit are still checked.
            if not_found_counter > 10:
                # Scale to a real percentage: the message prints a '%' sign.
                percent_missing = 100.0 * not_found_counter / all_counter
                print(
                    "---- ERROR: (%0.2f%%)  %s / %s variants not found. Project %s should be reloaded. Examples: "
                    % (percent_missing, not_found_counter, all_counter,
                       project_id))

                for v in not_found_variants:
                    print(
                        "http://exac.broadinstitute.org/variant/"
                        + v)
Exemple #21
0
 def __str__(self):
     """Return ``<chrom>-<pos>-<ref>-<alt>:<tag>`` for this object.

     Chromosome and position are decoded from ``self.xpos`` via
     ``genomeloc.get_chr_pos``; the tag comes from ``self.project_tag.tag``.
     """
     chrom, position = genomeloc.get_chr_pos(self.xpos)
     parts = (chrom, position, self.ref, self.alt, self.project_tag.tag)
     return "%s-%s-%s-%s:%s" % parts