Ejemplo n.º 1
0
 def test_open_known_genes(self):
     """ check that opening known DD genes works correctly
     
     Writes a two-gene table to a temporary file, first tab-separated and
     then pipe-separated, and checks that open_known_genes() gives the same
     expected table for both separators.
     """
     
     genes = DataFrame({
         'gene': ['TEMP1', 'TEMP2'],
         'ddg2p_status': ['Confirmed DD Gene', 'Possible DD Gene'],
         'mode': ['Monoallelic', 'X-linked dominant'],
         'mech': ['loss-of-function', 'Activating'],
         'chr': ['chr1', '1'],
         'start': [1000, 2000],
         'end': [1500, 2500],
     })
     
     # the expected table differs from the input: only the TEMP1 row is
     # present, 'ddg2p_status' appears as 'type', 'mech' is capitalised, the
     # 'chr' prefix is gone from the chromosome, and there are extra
     # 'dominant' and 'hemizygous' columns
     expected = DataFrame({
         'gene': ['TEMP1'],
         'type': ['Confirmed DD Gene'],
         'mode': ['Monoallelic'],
         'mech': ['Loss-of-function'],
         'chr': ['1'],
         'start': [1000],
         'end': [1500],
         'dominant': [True],
         'hemizygous': [False],
     })
     
     # hold the temporary file in a context manager, so that it is closed
     # and cleaned up even if one of the comparisons below fails
     with tempfile.NamedTemporaryFile() as temp:
         genes.to_csv(temp.name, sep='\t', index=False)
         observed = open_known_genes(temp.name)
         self.compare_tables(observed, expected)
         
         # and check that it still works cleanly with the pipe separator
         genes.to_csv(temp.name, sep='|', index=False)
         observed = open_known_genes(temp.name)
         self.compare_tables(observed, expected)
Ejemplo n.º 2
0
    def test_open_known_genes(self):
        """ check that opening known DD genes works correctly

        The same two-gene table is written to a temporary file twice, once
        tab-separated and once pipe-separated. open_known_genes() has to
        return the identical expected table each time.
        """

        genes = DataFrame({
            'gene': ['TEMP1', 'TEMP2'],
            'ddg2p_status': ['Confirmed DD Gene', 'Possible DD Gene'],
            'mode': ['Monoallelic', 'X-linked dominant'],
            'mech': ['loss-of-function', 'Activating'],
            'chr': ['chr1', '1'],
            'start': [1000, 2000],
            'end': [1500, 2500],
        })

        # compared with the input, the expected table has the TEMP1 row only,
        # a 'type' column in place of 'ddg2p_status', a capitalised 'mech',
        # a chromosome without the 'chr' prefix, and additional 'dominant'
        # and 'hemizygous' columns
        expected = DataFrame({
            'gene': ['TEMP1'],
            'type': ['Confirmed DD Gene'],
            'mode': ['Monoallelic'],
            'mech': ['Loss-of-function'],
            'chr': ['1'],
            'start': [1000],
            'end': [1500],
            'dominant': [True],
            'hemizygous': [False],
        })

        # the context manager closes and removes the temporary file, even
        # when an assertion inside the block fails
        with tempfile.NamedTemporaryFile() as temp:
            genes.to_csv(temp.name, sep='\t', index=False)
            observed = open_known_genes(temp.name)
            self.compare_tables(observed, expected)

            # and check that it still works cleanly with the pipe separator
            genes.to_csv(temp.name, sep='|', index=False)
            observed = open_known_genes(temp.name)
            self.compare_tables(observed, expected)
Ejemplo n.º 3
0
def get_diagnosed(diagnosed_path, updated_path, de_novo_path,
        low_pp_dnm_validations_path, known_genes_path, families_path,
        recessive_path):
    """ find probands likely to have diagnoses, to exclude them from our data
    
    Args:
        diagnosed_path: path handed to get_reviewed() for the reviewed
            diagnoses.
        updated_path: path handed to get_reviewed() along with diagnosed_path.
        de_novo_path: path handed to get_current_de_novos() for the current
            de novo candidates.
        low_pp_dnm_validations_path: path handed to
            get_low_pp_dnm_validations() for the validation results of low
            pp_dnm candidates.
        known_genes_path: path handed to open_known_genes().
        families_path: path handed to get_reviewed() along with diagnosed_path.
        recessive_path: path to a tab-separated table which is appended to the
            diagnoses, or None to skip this step.
    
    Returns:
        A table of probands with diagnoses: the reviewed diagnoses, then the
        likely diagnostic de novos which check_for_match() did not flag as
        already present, then the table from recessive_path (if given).
    """
    
    initial_diagnosed = get_reviewed(diagnosed_path, families_path, updated_path)
    
    known_genes = open_known_genes(known_genes_path)
    variants = get_current_de_novos(de_novo_path)
    
    # the candidates with low pp_dnm (< 0.9) in DDG2P genes were attempted to
    # validate. Those that did, we can swap the pp_dnm to 1, since these are now
    # high confidence de novos
    low_pp_dnm = get_low_pp_dnm_validations(low_pp_dnm_validations_path)
    variants = variants.merge(low_pp_dnm[["person_id", "chrom", "start_pos", "status"]],
        how="left", on=["person_id", "chrom", "start_pos"])
    # set the values with a single .loc call. Chained indexing
    # (variants["pp_dnm"][mask] = 1) can end up writing to a temporary copy,
    # which would leave the table unchanged
    variants.loc[variants["status"] == "de_novo", "pp_dnm"] = 1
    
    # get the set of de novos from the current dataset that are likely to be
    # diagnostic. These are de novos in genes with dominant modes of inheritance,
    # or chrX de novos in males in genes with hemizygous mode of inheritance,
    # and where the site is high confidence (as determined by having a high
    # pp_dnm, or a missing pp_dnm)
    # NOTE(review): the threshold used here is 0.1, whereas the comment above
    # talks about low pp_dnm being < 0.9 - confirm 0.1 is the intended cutoff
    dominant = known_genes["gene"][known_genes["dominant"]]
    hemizygous = known_genes["gene"][known_genes["hemizygous"]]
    likely_diagnostic = variants[(variants["hgnc"].isin(dominant) | \
        (variants["hgnc"].isin(hemizygous) & (variants["sex"] == "male")))
        & ((variants["pp_dnm"] > 0.1) | variants["pp_dnm"].isnull())]
    
    # define the sufficiently pathogenic consequences
    permitted = ["missense_variant", "frameshift_variant", "stop_gained",
        "splice_donor_variant", "splice_acceptor_variant", "inframe_deletion",
        "conserved_exon_terminus_variant", "initiator_codon_variant",
        "inframe_insertion"]
    
    # remove the nonfunctional variants
    likely_diagnostic = likely_diagnostic[likely_diagnostic["consequence"].isin(permitted)]
    likely_diagnostic = likely_diagnostic[["person_id", "sex", "chrom", "start_pos",
        "end_pos", "ref_allele", "alt_allele", "hgnc", "inheritance", "confirmed", "type"]]
    
    # remove the sites from the likely diagnoses that form part of a confirmed
    # diagnosis. There is nothing to match once no candidates remain, so only
    # run the row-wise check on a non-empty table.
    if not likely_diagnostic.empty:
        in_prev = likely_diagnostic.apply(axis=1, func=check_for_match, initial=initial_diagnosed)
        likely_diagnostic = likely_diagnostic[~in_prev]
    
    # pandas.concat replaces DataFrame.append, which pandas 2.0 removed
    diagnosed = pandas.concat([initial_diagnosed, likely_diagnostic], ignore_index=True)
    
    if recessive_path is not None:
        recessive = pandas.read_table(recessive_path, sep="\t")
        diagnosed = pandas.concat([diagnosed, recessive], ignore_index=True)
    
    return diagnosed
Ejemplo n.º 4
0
def open_external_variants(meta_variants, meta_subset, diagnosed_path, known_genes_path):
    """ load the de novo variants from external studies
    
    Args:
        meta_variants: path to gzipped, tab-separated table of variants.
        meta_subset: string of comma-separated phenotypes to keep (matched
            against the "study_phenotype" column), or None to keep all rows.
        diagnosed_path: only compared against None. If not None, variants in
            known dominant genes, and variants from males in known hemizygous
            genes, are removed.
        known_genes_path: path passed to open_known_genes(). Only used when
            diagnosed_path is not None.
    
    Returns:
        table of the variants which remain after filtering.
    """
    
    variants = pandas.read_table(meta_variants, sep="\t", compression="gzip")
    
    if meta_subset is not None:
        phenotypes = meta_subset.split(",")
        variants = variants[variants["study_phenotype"].isin(phenotypes)]
    
    # without a diagnosed path we return the (possibly subsetted) table as is
    if diagnosed_path is None:
        return variants
    
    known_genes = open_known_genes(known_genes_path)
    in_dominant = variants["hgnc"].isin(known_genes["gene"][known_genes["dominant"]])
    is_male = variants["sex"] == "male"
    in_hemizygous = variants["hgnc"].isin(known_genes["gene"][known_genes["hemizygous"]])
    
    # drop every variant which could explain a diagnosis
    diagnostic = in_dominant | (is_male & in_hemizygous)
    return variants[~diagnostic]
Ejemplo n.º 5
0
def include_known_gene_status(merged, known_gene_path):
    """ flag which genes are known developmental disorder genes
    
    Args:
        merged: table with a "hgnc" column. This is modified in place, gaining
            "known" and "known_dominant" columns.
        known_gene_path: path passed to open_known_genes().
    
    Returns:
        the same merged table, including the two extra columns.
    """
    
    ddg2p = open_known_genes(known_gene_path)
    all_genes = ddg2p["gene"]
    dominant_genes = all_genes[ddg2p["dominant"]]
    
    # "known" marks any gene in the table, "known_dominant" only the genes
    # flagged as dominant
    symbols = merged["hgnc"]
    merged["known"] = symbols.isin(all_genes)
    merged["known_dominant"] = symbols.isin(dominant_genes)
    
    return merged
Ejemplo n.º 6
0
def count_external_trios(meta_cohort,
                         meta_variants,
                         known_genes_path,
                         diagnosed,
                         meta_subset=None):
    """ count the male and female probands in the external cohorts
    
    Args:
        meta_cohort: path to tab-separated table of counts of probands in
            external exome and genome sequencing studies. Read for its
            "unique_male" and "unique_female" columns.
        meta_variants: path to gzipped, tab-separated table of de novo
            mutations from external studies. Only read when diagnosed is not
            None.
        known_genes_path: path to table of known developmental disorder genes,
            passed to open_known_genes(). Only read when diagnosed is not None.
        diagnosed: only compared against None. If not None, probands with a
            variant in a known dominant gene, and males with a variant in a
            known hemizygous gene, are subtracted from the counts.
        meta_subset: string of comma-separated list of phenotypes to include
            (matched against the "study_phenotype" column), or None.
    
    Returns:
        tuple of male and female proband counts.
    """

    cohorts = pandas.read_table(meta_cohort, sep="\t")
    phenotypes = None if meta_subset is None else meta_subset.split(",")
    if phenotypes is not None:
        cohorts = cohorts[cohorts["study_phenotype"].isin(phenotypes)]

    male = sum(cohorts["unique_male"])
    female = sum(cohorts["unique_female"])

    # nothing to subtract unless we were asked to remove diagnosed probands
    if diagnosed is None:
        return (male, female)

    variants = pandas.read_table(meta_variants,
                                 sep="\t",
                                 compression="gzip")
    if phenotypes is not None:
        variants = variants[variants["study_phenotype"].isin(phenotypes)]

    known_genes = open_known_genes(known_genes_path)
    in_dominant = variants["hgnc"].isin(
        known_genes["gene"][known_genes["dominant"]])
    male_hemizygous = (variants["sex"] == "male") & variants["hgnc"].isin(
        known_genes["gene"][known_genes["hemizygous"]])
    with_diagnosis = variants[in_dominant | male_hemizygous]

    # count each person once, however many diagnostic variants they carry
    repeated = with_diagnosis[["person_id", "sex"]].duplicated()
    probands = with_diagnosis[~repeated]

    # decrement for the diagnosed external individuals of each sex
    male -= sum(probands["sex"] == "male")
    female -= sum(probands["sex"] == "female")

    return (male, female)
Ejemplo n.º 7
0
def open_external_variants(meta_variants, meta_subset, diagnosed_path,
                           known_genes_path):
    """ read the external de novo variants, with optional filtering

    Args:
        meta_variants: path to gzipped, tab-separated table of variants.
        meta_subset: string of comma-separated phenotypes to retain (checked
            against the "study_phenotype" column), or None for no subsetting.
        diagnosed_path: used purely as a switch. Any value other than None
            removes variants in known dominant genes, plus variants in males
            in known hemizygous genes.
        known_genes_path: path given to open_known_genes(). Ignored when
            diagnosed_path is None.

    Returns:
        table of variants left after the requested filters.
    """

    variants = pandas.read_table(meta_variants, sep="\t", compression="gzip")

    if meta_subset is not None:
        wanted = meta_subset.split(",")
        in_subset = variants["study_phenotype"].isin(wanted)
        variants = variants[in_subset]

    if diagnosed_path is not None:
        known_genes = open_known_genes(known_genes_path)
        genes = known_genes["gene"]

        dominant_hit = variants["hgnc"].isin(genes[known_genes["dominant"]])
        male = variants["sex"] == "male"
        hemizygous_hit = variants["hgnc"].isin(
            genes[known_genes["hemizygous"]])

        # keep only the variants which are not potentially diagnostic
        exclude = dominant_hit | (male & hemizygous_hit)
        variants = variants[~exclude]

    return variants
Ejemplo n.º 8
0
def count_external_trios(meta_cohort, meta_variants, known_genes_path, diagnosed, meta_subset=None):
    """ get the numbers of male and female probands in the external studies
    
    Args:
        meta_cohort: path to tab-separated table of counts of probands in
            external exome and genome sequencing studies (uses the
            "unique_male" and "unique_female" columns).
        meta_variants: path to gzipped, tab-separated table of de novo
            mutations from external exome and genome sequencing studies. Only
            opened if diagnosed is not None.
        known_genes_path: path to table of known developmental disorder genes,
            given to open_known_genes(). Only opened if diagnosed is not None.
        diagnosed: acts as a switch. None returns the plain cohort totals, any
            other value subtracts the probands who have a variant in a known
            dominant gene, or are male with a variant in a known hemizygous
            gene.
        meta_subset: string of comma-separated list of phenotypes to include in
            the meta-analysis, or None.
    
    Returns:
        tuple of male and female proband counts.
    """
    
    use_subset = meta_subset is not None
    
    cohorts = pandas.read_table(meta_cohort, sep="\t")
    if use_subset:
        keep = cohorts["study_phenotype"].isin(meta_subset.split(","))
        cohorts = cohorts[keep]
    
    male = sum(cohorts["unique_male"])
    female = sum(cohorts["unique_female"])
    
    if diagnosed is not None:
        variants = pandas.read_table(meta_variants, sep="\t", compression="gzip")
        if use_subset:
            keep = variants["study_phenotype"].isin(meta_subset.split(","))
            variants = variants[keep]
        
        known_genes = open_known_genes(known_genes_path)
        dominant_genes = known_genes["gene"][known_genes["dominant"]]
        hemizygous_genes = known_genes["gene"][known_genes["hemizygous"]]
        
        explained = variants["hgnc"].isin(dominant_genes) | \
            ((variants["sex"] == "male") & variants["hgnc"].isin(hemizygous_genes))
        carriers = variants[explained]
        
        # one row per person, so that nobody is subtracted more than once
        carriers = carriers[~carriers[["person_id", "sex"]].duplicated()]
        
        # decrement for the diagnosed external individuals of each sex
        sexes = carriers["sex"]
        male -= sum(sexes == "male")
        female -= sum(sexes == "female")
    
    return (male, female)
Ejemplo n.º 9
0
def get_diagnosed(diagnosed_path, updated_path, de_novo_path,
                  low_pp_dnm_validations_path, known_genes_path, families_path,
                  recessive_path):
    """ find probands likely to have diagnoses, to exclude them from our data
    
    Args:
        diagnosed_path: path given to get_reviewed() to load the reviewed
            diagnoses.
        updated_path: path given to get_reviewed() with diagnosed_path.
        de_novo_path: path given to get_current_de_novos() to load the current
            de novo candidates.
        low_pp_dnm_validations_path: path given to
            get_low_pp_dnm_validations() to load validation results for
            candidates with low pp_dnm.
        known_genes_path: path given to open_known_genes().
        families_path: path given to get_reviewed() with diagnosed_path.
        recessive_path: path to a tab-separated table to add to the end of the
            diagnoses, or None.
    
    Returns:
        A table of probands with diagnoses. The rows are the reviewed
        diagnoses, followed by likely diagnostic de novos not flagged by
        check_for_match(), followed by the recessive_path table (if given).
    """

    initial_diagnosed = get_reviewed(diagnosed_path, families_path,
                                     updated_path)

    known_genes = open_known_genes(known_genes_path)
    variants = get_current_de_novos(de_novo_path)

    # the candidates with low pp_dnm (< 0.9) in DDG2P genes were attempted to
    # validate. Those that did, we can swap the pp_dnm to 1, since these are now
    # high confidence de novos
    low_pp_dnm = get_low_pp_dnm_validations(low_pp_dnm_validations_path)
    variants = variants.merge(
        low_pp_dnm[["person_id", "chrom", "start_pos", "status"]],
        how="left",
        on=["person_id", "chrom", "start_pos"])
    # a single .loc assignment always modifies the table itself, whereas the
    # chained form (variants["pp_dnm"][mask] = 1) may only modify a copy
    validated = variants["status"] == "de_novo"
    variants.loc[validated, "pp_dnm"] = 1

    # get the set of de novos from the current dataset that are likely to be
    # diagnostic. These are de novos in genes with dominant modes of inheritance,
    # or chrX de novos in males in genes with hemizygous mode of inheritance,
    # and where the site is high confidence (as determined by having a high
    # pp_dnm, or a missing pp_dnm)
    # NOTE(review): "high" is implemented as pp_dnm > 0.1 here, although the
    # comment further up describes low pp_dnm as < 0.9 - confirm the cutoff
    dominant = known_genes["gene"][known_genes["dominant"]]
    hemizygous = known_genes["gene"][known_genes["hemizygous"]]
    in_dominant = variants["hgnc"].isin(dominant)
    male_hemizygous = (variants["hgnc"].isin(hemizygous)
                       & (variants["sex"] == "male"))
    confident = (variants["pp_dnm"] > 0.1) | variants["pp_dnm"].isnull()
    likely_diagnostic = variants[(in_dominant | male_hemizygous) & confident]

    # define the sufficiently pathogenic consequences
    permitted = [
        "missense_variant", "frameshift_variant", "stop_gained",
        "splice_donor_variant", "splice_acceptor_variant", "inframe_deletion",
        "conserved_exon_terminus_variant", "initiator_codon_variant",
        "inframe_insertion"
    ]

    # remove the nonfunctional variants
    likely_diagnostic = likely_diagnostic[
        likely_diagnostic["consequence"].isin(permitted)]
    likely_diagnostic = likely_diagnostic[[
        "person_id", "sex", "chrom", "start_pos", "end_pos", "ref_allele",
        "alt_allele", "hgnc", "inheritance", "confirmed", "type"
    ]]

    # remove the sites from the likely diagnoses that form part of a confirmed
    # diagnosis. With no candidates left there is nothing to compare, so the
    # row-wise check only runs on a non-empty table.
    if not likely_diagnostic.empty:
        in_prev = likely_diagnostic.apply(axis=1,
                                          func=check_for_match,
                                          initial=initial_diagnosed)
        likely_diagnostic = likely_diagnostic[~in_prev]

    # DataFrame.append was removed in pandas 2.0, pandas.concat is the
    # supported way to stack the tables
    tables = [initial_diagnosed, likely_diagnostic]
    if recessive_path is not None:
        tables.append(pandas.read_table(recessive_path, sep="\t"))

    return pandas.concat(tables, ignore_index=True)