Ejemplo n.º 1
0
async def change_build(args):
    ''' shift variants onto a new genome build

    Asynchronous generator over lines of text.

    Args:
        args: object with an 'input' attribute (an iterable, readable handle of
            tab-separated lines) and a 'to' attribute naming the target build.

    Yields:
        First, the first line of args.input unchanged (presumably a header
        line - confirm against the caller). Then, for every remaining line,
        the string form of the converted variant followed by a newline.
        Lines whose DeNovo is falsy, or whose conversion is falsy, are skipped.
    '''
    # pass the first line straight through without parsing it
    yield args.input.readline()

    for text in args.input:
        fields = text.strip('\n').split('\t')
        original = DeNovo(*fields)
        if original:
            lifted = original.to_build(args.to)
            # a falsy result means nothing is emitted for this variant
            if lifted:
                yield str(lifted) + '\n'
Ejemplo n.º 2
0
async def an_science_de_novos(result):
    """ collect de novo mutations from the An et al. autism dataset

    Table S2 from:
    An et al. Science 362: eaat6576, doi: 10.1126/science.aat6576

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch38'.
    """
    logging.info('getting An et al Science 2018 de novos')
    with warnings.catch_warnings():
        # the spreadsheet triggers an unknown-extension warning that has no
        # effect on the loaded data, so silence warnings while reading
        warnings.simplefilter('ignore')
        table = pandas.read_excel(
            url,
            sheet_name='Table S2 de novo mutations',
            skiprows=1,
            usecols=list(range(8)),
            engine='openpyxl',
        )

    table['chrom'] = table['Chr'].astype(str)

    # tag sample IDs with the cohort, and annotate every row with the study
    table['SampleID'] += '|asd_cohorts'
    table['study'] = '10.1126/science.aat6576'
    table['confidence'] = 'high'

    variants = {
        DeNovo(rec.SampleID, rec.chrom, rec.Pos, rec.Ref, rec.Alt,
               rec.study, rec.confidence, 'grch38')
        for rec in table.itertuples()
    }

    result.append(variants)
Ejemplo n.º 3
0
async def halldorsson_science_de_novos(result):
    """ collect de novo variants for Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043

    Halldorsson supersedes Jonsson et al, since at least 99.2% of the Jonsson
    et al samples occur in Halldorsson. See dnm_cohorts.halldorsson_check.py
    for more details.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch38'.
    """
    logging.info('getting Halldorsson et al Science 2019 de novos')
    with tempfile.NamedTemporaryFile() as scratch:
        # the url redirects, so fetch it to a local file via download_file
        # rather than handing the url to pandas
        download_file(url, scratch.name)
        table = pandas.read_table(scratch.name, comment='#')

    # standardise the column names
    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'
    table['chrom'] = table['Chr'].astype(str)
    table['pos'] = table['Pos']
    table['ref'] = table['Ref']
    table['alt'] = table['Alt']

    # constant annotations shared by every variant
    table['study'] = '10.1126/science.aau1043'
    table['confidence'] = 'high'
    table['build'] = 'grch38'

    found = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, rec.build)
        for rec in table.itertuples()
    }

    result.append(found)
Ejemplo n.º 4
0
async def homsy_science_de_novos(result):
    """ get de novo variants for Homsy et al Science 2015

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Homsy et al Science 2015 de novos')

    # context managers make sure the temporary download and the spreadsheet
    # handle inside the zip archive are closed once the table is loaded
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped:
            with zipped.open('homsy_database_S02.xlsx') as handle:
                data = pandas.read_excel(handle, 'Database S2', skiprows=1)

    data['person_id'] = data['Blinded ID'].astype(str)
    data['person_id'] += '|homsy'
    data['chrom'] = data['CHROM'].astype(str)
    data['pos'] = data['POS']
    data['ref'] = data['REF']
    data['alt'] = data['ALT']
    data['study'] = '10.1126/science.aac9396'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 5
0
async def epi4k_ajhg_de_novos(result, limiter):
    """ collect de novo data for the Epi4K epilepsy exome study

    De novo mutation data from the most recent EPI4K publication:
    Supplementary table 1:
    American Journal of Human Genetics (2014) 95:360-370
    doi: 10.1016/j.ajhg.2014.08.013

    This incorporates the de novo mutation data from supplementary table 2 of:
    Allen et al. (2013) Nature 501:217-221
    doi: 10.1038/nature12439

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_coordinates_with_allele (presumably a
            rate limiter for remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Epi4K et al AJHG 2014 de novos')
    table = pandas.read_excel(url, skipfooter=4)

    # convert the combined coordinate and allele columns to separate fields
    coords = await fix_coordinates_with_allele(
        limiter,
        table['hg19 coordinates (chr:position)'],
        table["Ref/Alt alleles"],
    )
    table['chrom'], table['pos'], table['ref'], table['alt'] = coords

    table['study'] = "10.1016/j.ajhg.2014.08.013"

    table['person_id'] = get_person_ids(table)
    table['person_id'] += '|epi4k'
    table['confidence'] = 'high'

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, 'grch37')
        for _, rec in table.iterrows()
    }

    result.append(variants)
Ejemplo n.º 6
0
async def mcrae_nature_de_novos(result):
    """ load de novo mutations from McRae et al Nature 2017

    These de novos are loaded from Supplementary Table 1 from
    McRae et al Nature 2017 542:433-438
    doi: 10.1038/nature21062

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting McRae et al Nature 2017 de novos')
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1')

    # standardise the column names, tagging person IDs with the cohort
    table['person_id'] = table['Individual ID'] + '|DDD'
    table['chrom'] = table['Chromosome'].astype(str)
    table['pos'] = table['Position (GRCh37)']
    table['ref'] = table['Reference allele']
    table['alt'] = table['Alternate allele']
    table['study'] = '10.1038/nature21062'

    # a variant is high confidence if it has no PP(DNM) value, if PP(DNM)
    # exceeds the threshold, or if it has been validated
    posterior = table['PP(DNM)']
    unscored = posterior.isnull()
    above_threshold = posterior > 0.00781
    validated = table['Status'] == 'validated'
    passes = unscored | above_threshold | validated
    table['confidence'] = passes.map({True: 'high', False: 'low'})

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, 'grch37')
        for _, rec in table.iterrows()
    }

    result.append(variants)
Ejemplo n.º 7
0
async def de_ligt_nejm_de_novos(result, limiter):
    """ get de novo mutations from De Ligt et al., 2012

    De Ligt et al., (2012) N Engl J Med 367:1921-1929
    doi:10.1056/NEJMoa1206524

    Variants sourced from Supplementary Table S3.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_hgvs_coordinates (presumably a rate
            limiter for remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting De ligt et al NEJM 2012 de novos')

    # the temporary file is only needed while the table is extracted, so
    # close (and thereby delete) it as soon as the data has been loaded
    with tempfile.NamedTemporaryFile() as temp:
        download_with_cookies(url, temp.name)
        data = extract_table(temp)
        data = clean_table(data)

    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter, data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    data['person_id'] += '|de_ligt'
    data['study'] = "10.1056/NEJMoa1206524"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 8
0
async def sanders_neuron_de_novos(result):
    """ get de novo data from the Sanders et al Neuron autism exome study

    Supplementary table 5 from:
    Sanders et al. (2015) Neuron 87:1215-1233
    doi: 10.1016/j.neuron.2015.09.016

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37', with confidence 'low'
        for rows flagged 'lowConf' and 'high' otherwise.
    """
    logging.info('getting Sanders et al Neuron 2012 de novos')
    data = pandas.read_excel(url, sheet_name='Exome')

    # remove some sibs with bad IDs. These sibs are not in the cohort table.
    remove = {'13930.s1', '12675.s1', '12707.s1', '11931.s1', '13867.s1', '14636.s1'}
    # take an explicit copy, so the column assignments below operate on an
    # independent frame rather than on a filtered selection of the original
    # (which makes pandas emit SettingWithCopyWarning)
    data = data[~data.patientID.isin(remove)].copy()

    data['person_id'] = data['patientID'].astype(str) + '|asd_cohorts'
    data['chrom'] = data['Chr'].astype(str)
    data['pos'] = data['Pos(hg19)']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['study'] = "10.1016/j.neuron.2015.09.016"

    quality = data['Confidence'] != 'lowConf'
    data['confidence'] = quality.map({True: 'high', False: 'low'})

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 9
0
async def kaplanis_nature_de_novos(result, limiter):
    """ load de novo mutations from Kaplanis et al Nature 2020

    These de novos are loaded from Supplementary Table 1 from
    Kaplanis et al Nature 2020
    doi: 10.1038/s41586-020-2832-5

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_alleles (presumably a rate limiter for
            remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Kaplanis et al Nature 2019 de novos')
    table = pandas.read_table(url)

    # person IDs are suffixed with the 'study' column as found in the source
    # table, so this has to happen before that column is overwritten below
    table['person_id'] = table['id'] + '|' + table['study']
    table['chrom'] = table['chrom'].astype(str)
    table['study'] = '10.1038/s41586-020-2832-5'
    table['confidence'] = 'high'
    table['build'] = 'grch37'

    # fix RUMC indels, as insertions lack ref alleles and deletions lack alts
    fixed = await fix_alleles(limiter, table)
    table['ref'], table['alt'] = fixed

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, rec.build)
        for _, rec in table.iterrows()
    }

    result.append(variants)
Ejemplo n.º 10
0
async def gilissen_nature_de_novos(result, limiter):
    """ load de novos from Gilissen et al Nature 2014

    Nature 511: 344-347 2014, doi:10.1038/nature13394
    Supplementary table S8.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_hgvs_coordinates (presumably a rate
            limiter for remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Gilissen et al Nature 2014 de novos')

    # the temporary file is only needed while the table is extracted, so
    # close (and thereby delete) it as soon as the data has been loaded
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)
        data = clean_table(data)

    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter,
                                                      data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    # NOTE(review): person IDs get the '|de_ligt' suffix here too, as in the
    # original code - confirm this cohort label is intended for this study
    data['person_id'] += '|de_ligt'
    data['study'] = '10.1038/nature13394'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 11
0
def open_de_novos(path=None):
    ''' opens de novos, loads file from repo by default

    Pass 'grch37' or 'grch38' (case-insensitive) to open variants lifted to
    that build. Any other truthy value is used as the path to a gzipped,
    tab-separated file.

    Args:
        path: None/empty for the default file, a build name, or a file path.

    Returns:
        list of DeNovo objects, one per line after the first line of the file.
    '''
    if not path:
        path = DE_NOVO_PATH
    elif isinstance(path, str):
        build = path.lower()
        if build == 'grch37':
            path = DE_NOVO_PATH_b37
        elif build == 'grch38':
            path = DE_NOVO_PATH_b38

    with gzip.open(path, 'rt') as handle:
        # the first line is read and discarded (treated as a header)
        handle.readline()
        rows = (line.strip('\n').split('\t') for line in handle)
        return [DeNovo(*fields) for fields in rows]
Ejemplo n.º 12
0
async def iossifov_neuron_de_novos(result, limiter):
    """ get de novo data from the 2012 Iossifov et al autism exome study in Neuron

    Supplementary table 1 (where the non-coding SNVs have been excluded) and
    supplementary table 2 from:
    Iossifov et al. (2012) Neuron 74:285-299
    doi: 10.1016/j.neuron.2012.04.009

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_coordinates_with_allele (presumably a
            rate limiter for remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Iossifov et al Neuron 2012 de novos')
    snvs = pandas.read_excel(snv_url, sheet_name='SNV.v4.1-normlized')
    indels = pandas.read_excel(indel_url, sheet_name='ID.v4.1-normlized')

    # trim out the low quality de novos (as defined by a flag in the table)
    snvs = snvs[snvs['SNVFilter']]
    indels = indels[indels['IndelFilter']]

    # merge the SNV and indel de novo calls, keeping only the shared columns.
    # DataFrame.append was removed in pandas 2.0, so use pandas.concat, which
    # gives the same result and also works on older pandas versions.
    columns = [
        'quadId', 'location', 'variant', 'effectGenes', 'effectType', 'inChild'
    ]
    data = pandas.concat([snvs[columns], indels[columns]], ignore_index=True)

    # get the coordinates
    coords = await fix_coordinates_with_allele(limiter, data['location'],
                                               data['variant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = coords

    data['person_id'] = get_person_ids(data)
    data = tidy_families(data)

    data['person_id'] += '|asd_cohorts'
    data['study'] = '10.1016/j.neuron.2012.04.009'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 13
0
async def jonsson_nature_de_novos(result):
    """ get de novo variants for Jonsson et al Nature 2017

    This has been superseded by Halldorsson et al, since 99.2% of the samples
    from Jonsson et al exist in Halldorsson et al.

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch38'.
    """
    logging.info('getting Jonsson et al Nature 2017 de novos')

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Every handle is managed by a context
    # manager so the temporary download and the archive members are closed
    # once the table has been read.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive:
            with archive.open(path) as tarball:
                with tarfile.open(fileobj=tarball) as tar:
                    member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
                    data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype(str)
    data['pos'] = data['Pos_hg38']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['study'] = '10.1038/nature24018'
    data['confidence'] = 'high'
    data['build'] = 'grch38'

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    variants = set()
    for row in data.itertuples():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, row.build)
        variants.add(var)

    result.append(variants)
Ejemplo n.º 14
0
async def oroak_nature_de_novos(result, limiter):
    """ collect de novo data from the O'Roak et al autism exome study

    Supplementary table 3 from:
    O'Roak et al. (2012) Nature 485:246-250
    doi: 10.1038/nature10989

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_alleles (presumably a rate limiter for
            remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting O\'Roak et al Nature 2012 de novos')
    table = pandas.read_excel(
        url,
        sheet_name="Supplementary Table 3",
        skipfooter=3,
        engine='xlrd',
    )

    # standardise the chrom, position and allele column names
    table['chrom'] = table['Chromosome'].astype(str)
    table['pos'] = table['Position (hg19)'].astype(int)
    table['ref'] = table['Ref']
    table['alt'] = table['Allele']
    table['build'] = 'grch37'

    # clean up the alleles in three passes: tidy the alt column, then run
    # fix_alleles over the table, then run fix_het_alleles on each row
    table['alt'] = tidy_complex_alts(table['alt'])
    table['ref'], table['alt'] = await fix_alleles(limiter, table)

    pairs = [fix_het_alleles(rec.ref, rec.alt) for _, rec in table.iterrows()]
    refs, alts = zip(*pairs)
    table['ref'], table['alt'] = refs, alts

    table['person_id'] = table['Person'] + '|asd_cohorts'
    table['study'] = '10.1038/nature10989'
    table['confidence'] = 'high'

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, rec.build)
        for _, rec in table.iterrows()
    }

    result.append(variants)
Ejemplo n.º 15
0
async def sanders_nature_de_novos(result, limiter):
    """ get de novo data from the Sanders et al autism exome study

    Supplementary table 2 (where the excel sheets for the probands and
    siblings have been combined) from:
    Sanders et al. (2012) Nature 485:237-241
    doi: 10.1038/nature10945

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_alleles (presumably a rate limiter for
            remote lookups - confirm against that helper).

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Sanders et al Nature 2012 de novos')
    probands = pandas.read_excel(url, sheet_name='Probands', engine='xlrd')
    siblings = pandas.read_excel(url, sheet_name='Siblings', engine='xlrd')
    # DataFrame.append was removed in pandas 2.0, so use pandas.concat, which
    # gives the same result and also works on older pandas versions
    data = pandas.concat([probands, siblings], ignore_index=True)

    data['person_id'] = data['Child_ID'].astype(str)
    data['chrom'] = data['Chr'].str.replace('chr', '')
    data['pos'] = data['Pos (hg19)']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['build'] = 'grch37'

    # clean up the alleles
    data['ref'], data['alt'] = await fix_alleles(limiter, data)
    alleles = [fix_het_alleles(x.ref, x.alt) for _, x in data.iterrows()]
    data['ref'], data['alt'] = list(zip(*alleles))

    data['person_id'] += "|asd_cohorts"
    data['study'] = "10.1038/nature10945"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, row.build)
        variants.add(var)

    result.append(variants)
Ejemplo n.º 16
0
async def de_rubeis_nature_de_novos(result):
    """ get de novo data from the 2014 De Rubeis et al. autism exome study in Nature

    De novo mutation data sourced from Supplementary table 3:
    De Rubeis et al. (2013) Nature 515:209-215
    doi: 10.1038/nature13772

    NOTE(review): the summary line says 2014, while the citation and the log
    message say 2013 - confirm the publication year.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37'.
    """

    logging.info('getting De Rubeis et al Nature 2013 de novos')
    data = pandas.read_excel(url, sheet_name="De Novo", skipfooter=1)

    # rename columns to match the other de novo datasets
    data = data.rename(
        columns={
            'Chr': 'chrom',
            'Pos': 'pos',
            'Child_ID': 'person_id',
            'Ref': 'ref',
            'Alt': 'alt'
        })

    # strip whitespace and ensure columns are string. The pattern is a regex
    # alternation (space or tab), so regex matching must be requested
    # explicitly: pandas >= 2.0 treats the pattern as a literal string by
    # default, which would leave the whitespace in place.
    for col in ['person_id', 'chrom', 'ref', 'alt']:
        data[col] = data[col].astype(str).str.replace(' |\t', '', regex=True)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13772"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 17
0
async def iossifov_nature_de_novos(result):
    """ get de novo variants from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Variants sourced from Supplementary tables S2, with person IDs sourced from
    Table S1.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Iossifov et al Nature 2014 de novos')

    # context managers make sure the temporary download, the zip archive and
    # the spreadsheet handles are all closed once both tables are loaded
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)

        with ZipFile(temp.name) as handle:
            # obtain the dataframe of de novo variants
            with handle.open('nature13908-s2/Supplementary Table 2.xlsx') as table:
                data = pandas.read_excel(table)
            # obtain the dataframe of family information
            with handle.open('nature13908-s2/Supplementary Table 1.xlsx') as table:
                fams = pandas.read_excel(table)

    chrom, pos, ref, alt = fix_coordinates(data['location'],
                                           data['vcfVariant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    sample_ids = get_sample_ids(fams)
    data['person_id'] = get_person_ids(data, sample_ids)
    data = tidy_families(data)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13908"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
Ejemplo n.º 18
0
async def lelieveld_nn_de_novos(result, limiter):
    """ collect de novo data for Lelieveld et al. intellectual disability exome study

    De novo mutation data sourced from supplementary table 2 from:
    Lelieveld et al. (2016) Nature Neuroscience 19:1194-1196
    doi: 10.1038/nn.4352

    NOTE(review): the original note here said the paper reports alignment to
    hg19, but that the table of de novo data is "definitely for hg18
    (GRCh37)". hg18 and GRCh37 are different builds, so confirm which one
    was meant. The code labels every variant as 'grch37'.

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.
        limiter: passed through to fix_alleles (presumably a rate limiter for
            remote lookups - confirm against that helper).

    Returns:
        None.
    """
    logging.info('getting Lelieveld et al Nature Neuroscience 2016 de novos')
    table = pandas.read_excel(url, sheet_name='Supplementary Table 2')

    # standardise the column names, dropping the 'chr' prefix on chromosomes
    table['person_id'] = table['Patient key'].astype(str)
    table['chrom'] = table['Chromosome'].str.replace('chr', '')
    table['pos'] = table['Start position']
    table['ref'] = table['Reference Allele']
    table['alt'] = table['Variant Allele']
    table['build'] = 'grch37'

    fixed = await fix_alleles(limiter, table)
    table['ref'], table['alt'] = fixed

    table['person_id'] += '|lelieveld'
    table['study'] = '10.1038/nn.4352'
    table['confidence'] = 'high'

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, rec.build)
        for _, rec in table.iterrows()
    }

    result.append(variants)
Ejemplo n.º 19
0
async def jin_nature_genetics_de_novos(result):
    """ collect individual level data for Jin et al congenital heart disease

    Supplementary Table 9 from:
    Jin et al. Nature Genetics 49: 1593-1601, doi: 10.1038/ng.3970

    Args:
        result: list-like container; the set of DeNovo objects built here is
            appended to it.

    Returns:
        None. Variants are labelled as build 'grch37'.
    """
    logging.info('getting Jin et al Nature Genetics 2017 de novos')
    table = pandas.read_excel(url, 'S9', skiprows=1)

    # standardise the column names, tagging person IDs with the cohort
    blinded = table['Blinded ID'].astype(str)
    table['person_id'] = blinded + '|jin'
    table['chrom'] = table['CHROM'].astype(str)
    table['pos'] = table['POS']
    table['ref'] = table['REF']
    table['alt'] = table['ALT']

    # constant annotations shared by every variant
    table['study'] = '10.1038/ng.3970'
    table['confidence'] = 'high'

    variants = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, 'grch37')
        for _, rec in table.iterrows()
    }

    result.append(variants)