async def change_build(args):
    ''' lift variants over onto a different genome build

    Async generator. The first line of the input (presumably a header) is
    passed through untouched. Each following tab-separated line is parsed into
    a DeNovo, converted with `to_build(args.to)` and emitted as a text line.
    Lines whose DeNovo is falsy, or whose converted variant is falsy, are
    dropped.

    Args:
        args: object with an `input` attribute (an open text handle) and a `to`
            attribute (the target handed to DeNovo.to_build).

    Yields:
        the first input line, then one newline-terminated string per
        successfully converted variant.
    '''
    # the first line is forwarded as-is
    yield args.input.readline()

    for text in args.input:
        fields = text.strip('\n').split('\t')
        variant = DeNovo(*fields)
        if variant:
            lifted = variant.to_build(args.to)
            if lifted:
                yield str(lifted) + '\n'
async def an_science_de_novos(result):
    """ collect de novo mutations from the An et al autism dataset

    Table S2 from:
    An et al. Science 362: eaat6576, doi: 10.1126/science.aat6576

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch38') is appended to it; nothing is returned.
    """
    logging.info('getting An et al Science 2018 de novos')

    with warnings.catch_warnings():
        # loading raises a warning about an unknown extension, which has no
        # effect on the parsed data, so it is silenced here
        warnings.simplefilter('ignore')
        table = pandas.read_excel(
            url,
            sheet_name='Table S2 de novo mutations',
            skiprows=1,
            usecols=list(range(8)),
            engine='openpyxl',
        )

    table['chrom'] = table['Chr'].astype(str)
    table['SampleID'] += '|asd_cohorts'
    table['study'] = '10.1126/science.aat6576'
    table['confidence'] = 'high'

    de_novos = {
        DeNovo(x.SampleID, x.chrom, x.Pos, x.Ref, x.Alt, x.study,
               x.confidence, 'grch38')
        for x in table.itertuples()
    }
    result.append(de_novos)
async def halldorsson_science_de_novos(result):
    """ collect de novo variants from Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 363: eaau1043, doi: 10.1126/science.aau1043

    Halldorsson supersedes Jonsson et al, since at least 99.2% of the Jonsson
    et al samples occur in Halldorsson. See dnm_cohorts.halldorsson_check.py
    for more details.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch38') is appended to it; nothing is returned.
    """
    logging.info('getting Halldorsson et al Science 2019 de novos')

    with tempfile.NamedTemporaryFile() as handle:
        # the url redirects, so the file is fetched with download_file()
        # rather than being opened directly by pandas
        download_file(url, handle.name)
        table = pandas.read_table(handle.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'
    table['chrom'] = table['Chr'].astype(str)
    table['pos'] = table['Pos']
    table['ref'] = table['Ref']
    table['alt'] = table['Alt']
    table['study'] = '10.1126/science.aau1043'
    table['confidence'] = 'high'
    table['build'] = 'grch38'

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, x.build)
        for x in table.itertuples()
    }
    result.append(de_novos)
async def homsy_science_de_novos(result):
    """ get de novo variants for Homsy et al Science 2015

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    The supplementary zip archive is downloaded to a temporary file, and the
    'Database S2' sheet of the workbook inside it is parsed.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    logging.info('getting Homsy et al Science 2015 de novos')

    # context managers make sure the temporary download and the handle into
    # the archive are released as soon as the table has been parsed
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped:
            with zipped.open('homsy_database_S02.xlsx') as handle:
                data = pandas.read_excel(handle, sheet_name='Database S2',
                    skiprows=1)

    data['person_id'] = data['Blinded ID'].astype(str)
    data['person_id'] += '|homsy'
    data['chrom'] = data['CHROM'].astype(str)
    data['pos'] = data['POS']
    data['ref'] = data['REF']
    data['alt'] = data['ALT']
    data['study'] = '10.1126/science.aac9396'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
async def epi4k_ajhg_de_novos(result, limiter):
    """ collect de novo data for the Epi4K epilepsy exome study

    De novo mutation data from the most recent EPI4K publication:
    Supplementary table 1:
    American Journal of Human Genetics (2014) 95:360-370
    doi: 10.1016/j.ajhg.2014.08.013

    This incorporates the de novo mutation data from supplementary table 2 of:
    Allen et al. (2013) Nature 501:217-221
    doi: 10.1038/nature12439

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_coordinates_with_allele()
            (presumably a rate limiter for remote lookups - confirm).
    """
    logging.info('getting Epi4K et al AJHG 2014 de novos')
    table = pandas.read_excel(url, skipfooter=4)

    coords = await fix_coordinates_with_allele(
        limiter,
        table['hg19 coordinates (chr:position)'],
        table["Ref/Alt alleles"],
    )
    table['chrom'], table['pos'], table['ref'], table['alt'] = coords

    table['study'] = "10.1016/j.ajhg.2014.08.013"
    table['person_id'] = get_person_ids(table)
    table['person_id'] += '|epi4k'
    table['confidence'] = 'high'

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, 'grch37')
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def mcrae_nature_de_novos(result):
    """ load de novo mutations from McRae et al Nature 2017

    These de novos are loaded from Supplementary Table 1 from
    McRae et al Nature 2017 542:433-438
    doi: 10.1038/nature21062

    A variant is given 'high' confidence when its PP(DNM) value is missing,
    when PP(DNM) exceeds 0.00781, or when its Status is 'validated'; all
    other variants are given 'low' confidence.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    logging.info('getting McRae et al Nature 2017 de novos')
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1')

    table['person_id'] = table['Individual ID']
    table['chrom'] = table['Chromosome'].astype(str)
    table['pos'] = table['Position (GRCh37)']
    table['ref'] = table['Reference allele']
    table['alt'] = table['Alternate allele']
    table['person_id'] += '|DDD'
    table['study'] = '10.1038/nature21062'

    posterior = table['PP(DNM)']
    validated = table['Status'] == 'validated'
    passes = posterior.isnull() | (posterior > 0.00781) | validated
    table['confidence'] = passes.map({True: 'high', False: 'low'})

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, 'grch37')
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def de_ligt_nejm_de_novos(result, limiter):
    """ get de novo mutations from De Ligt et al., 2012

    De Ligt et al., (2012) N Engl J Med 367:1921-1929
    doi:10.1056/NEJMoa1206524
    Variants sourced from Supplementary Table S3.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_hgvs_coordinates() (presumably a rate
            limiter for remote lookups - confirm).
    """
    logging.info('getting De ligt et al NEJM 2012 de novos')

    # the table is extracted and cleaned while the temporary download still
    # exists; the context manager then closes (and removes) it
    with tempfile.NamedTemporaryFile() as temp:
        download_with_cookies(url, temp.name)
        data = extract_table(temp)
        data = clean_table(data)

    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter, data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    data['person_id'] += '|de_ligt'
    data['study'] = "10.1056/NEJMoa1206524"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
async def sanders_neuron_de_novos(result):
    """ collect de novo data from the Sanders et al Neuron autism exome study

    Supplementary table 5 from:
    Sanders et al. (2015) Neuron 87:1215-1233
    doi: 10.1016/j.neuron.2015.09.016

    Variants flagged 'lowConf' in the Confidence column are given 'low'
    confidence, all others 'high'.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    # NOTE(review): the log message says 2012 although the cited paper is from
    # 2015; the text is left unchanged here
    logging.info('getting Sanders et al Neuron 2012 de novos')
    table = pandas.read_excel(url, sheet_name='Exome')

    # drop some sibs with bad IDs. These sibs are not in the cohort table.
    bad_sibs = {
        '13930.s1', '12675.s1', '12707.s1',
        '11931.s1', '13867.s1', '14636.s1',
    }
    table = table[~table.patientID.isin(bad_sibs)]

    table['person_id'] = table['patientID'].astype(str) + '|asd_cohorts'
    table['chrom'] = table['Chr'].astype(str)
    table['pos'] = table['Pos(hg19)']
    table['ref'] = table['Ref']
    table['alt'] = table['Alt']
    table['study'] = "10.1016/j.neuron.2015.09.016"

    confident = table['Confidence'] != 'lowConf'
    table['confidence'] = confident.map({True: 'high', False: 'low'})

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, 'grch37')
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def kaplanis_nature_de_novos(result, limiter):
    """ load de novo mutations from Kaplanis et al Nature 2020

    These de novos are loaded from Supplementary Table 1 from
    Kaplanis et al Nature 2020
    doi: 10.1038/s41586-020-2832-5

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_alleles() (presumably a rate limiter
            for remote lookups - confirm).
    """
    logging.info('getting Kaplanis et al Nature 2019 de novos')
    table = pandas.read_table(url)

    # person IDs must be built first, while the 'study' column still holds
    # the per-row value from the table; it is overwritten with the DOI below
    table['person_id'] = table['id'] + '|' + table['study']
    table['chrom'] = table['chrom'].astype(str)
    table['study'] = '10.1038/s41586-020-2832-5'
    table['confidence'] = 'high'
    table['build'] = 'grch37'

    # fix RUMC indels, as insertions lack ref alleles and deletions lack alts
    fixed = await fix_alleles(limiter, table)
    table['ref'], table['alt'] = fixed

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, x.build)
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def gilissen_nature_de_novos(result, limiter):
    """ load de novos from Gilissen et al Nature 2014

    Nature 511: 344-347 2014, doi:10.1038/nature13394
    Supplementary table S8.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_hgvs_coordinates() (presumably a rate
            limiter for remote lookups - confirm).
    """
    logging.info('getting Gilissen et al Nature 2014 de novos')

    # the table is extracted and cleaned while the temporary download still
    # exists; the context manager then closes (and removes) it
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)
        data = clean_table(data)

    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter, data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    # NOTE(review): the '|de_ligt' suffix matches the De Ligt loader rather
    # than this study's name - presumably deliberate so that shared
    # individuals get identical IDs; confirm before changing
    data['person_id'] += '|de_ligt'
    data['study'] = '10.1038/nature13394'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
def open_de_novos(path=None):
    ''' load de novo variants from a gzipped, tab-separated file

    Args:
        path: file to read. A falsy value selects DE_NOVO_PATH. The strings
            'grch37' and 'grch38' (case-insensitive) select DE_NOVO_PATH_b37
            and DE_NOVO_PATH_b38 respectively. Any other value is used as
            the path directly.

    Returns:
        list of DeNovo objects, one per line after the first line.
    '''
    if not path:
        path = DE_NOVO_PATH
    elif isinstance(path, str):
        build = path.lower()
        if build == 'grch37':
            path = DE_NOVO_PATH_b37
        elif build == 'grch38':
            path = DE_NOVO_PATH_b38

    with gzip.open(path, 'rt') as handle:
        # discard the first (header) line
        handle.readline()
        de_novos = []
        for text in handle:
            fields = text.strip('\n').split('\t')
            de_novos.append(DeNovo(*fields))
        return de_novos
async def iossifov_neuron_de_novos(result, limiter):
    """ get de novo data from the 2012 Iossifov et al autism exome study in Neuron

    Supplementary table 1 (where the non-coding SNVs have been excluded) and
    supplementary table 2 from:
    Iossifov et al. (2012) Neuron 74:285-299
    doi: 10.1016/j.neuron.2012.04.009

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_coordinates_with_allele()
            (presumably a rate limiter for remote lookups - confirm).
    """
    logging.info('getting Iossifov et al Neuron 2012 de novos')
    snvs = pandas.read_excel(snv_url, sheet_name='SNV.v4.1-normlized')
    indels = pandas.read_excel(indel_url, sheet_name='ID.v4.1-normlized')

    # trim out the low quality de novos (as defined by a flag in the table)
    snvs = snvs[snvs['SNVFilter']]
    indels = indels[indels['IndelFilter']]

    # merge the SNV and indel de novo calls. DataFrame.append() was removed in
    # pandas 2.0, so the tables are combined with pandas.concat() instead.
    columns = ['quadId', 'location', 'variant', 'effectGenes', 'effectType',
        'inChild']
    data = pandas.concat([snvs[columns], indels[columns]], ignore_index=True)

    # get the coordinates
    coords = await fix_coordinates_with_allele(limiter, data['location'],
        data['variant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = coords

    data['person_id'] = get_person_ids(data)
    data = tidy_families(data)
    data['person_id'] += '|asd_cohorts'
    data['study'] = '10.1016/j.neuron.2012.04.009'
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
async def jonsson_nature_de_novos(result):
    """ get de novo variants for Jonsson et al Nature 2017

    This has been superseded by Halldorsson et al, since 99.2% of the samples
    from Jonsson et al exist in Halldorsson et al.

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch38') is appended to it; nothing is returned.
    """
    logging.info('getting Jonsson et al Nature 2017 de novos')

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Every handle is held by a context
    # manager so the temporary download and archive handles are released.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive:
            with archive.open(path) as tarball:
                with tarfile.open(fileobj=tarball) as tar:
                    member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
                    data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype(str)
    data['pos'] = data['Pos_hg38']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['study'] = '10.1038/nature24018'
    data['confidence'] = 'high'
    data['build'] = 'grch38'

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    variants = set()
    for row in data.itertuples():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, row.build)
        variants.add(var)

    result.append(variants)
async def oroak_nature_de_novos(result, limiter):
    """ collect de novo data from the O'Roak et al autism exome study

    Supplementary table 3 from:
    O'Roak et al. (2012) Nature 485:246-250
    doi: 10.1038/nature10989

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_alleles() (presumably a rate limiter
            for remote lookups - confirm).
    """
    logging.info('getting O\'Roak et al Nature 2012 de novos')
    table = pandas.read_excel(
        url,
        sheet_name="Supplementary Table 3",
        skipfooter=3,
        engine='xlrd',
    )

    # standardise the chrom, position and allele column names
    table['chrom'] = table['Chromosome'].astype(str)
    table['pos'] = table['Position (hg19)'].astype(int)
    table['ref'] = table['Ref']
    table['alt'] = table['Allele']
    table['build'] = 'grch37'

    # clean the alleles in three passes: tidy complex alts, fix the alleles,
    # then fix het alleles row by row
    table['alt'] = tidy_complex_alts(table['alt'])
    fixed = await fix_alleles(limiter, table)
    table['ref'], table['alt'] = fixed
    pairs = [fix_het_alleles(x.ref, x.alt) for _, x in table.iterrows()]
    refs, alts = list(zip(*pairs))
    table['ref'] = refs
    table['alt'] = alts

    table['person_id'] = table['Person'] + '|asd_cohorts'
    table['study'] = '10.1038/nature10989'
    table['confidence'] = 'high'

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, x.build)
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def sanders_nature_de_novos(result, limiter):
    """ get de novo data from the Sanders et al autism exome study

    Supplementary table 2 (where the excel sheets for the probands and
    siblings have been combined) from:
    Sanders et al. (2012) Nature 485:237-241
    doi: 10.1038/nature10945

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_alleles() (presumably a rate limiter
            for remote lookups - confirm).
    """
    logging.info('getting Sanders et al Nature 2012 de novos')
    probands = pandas.read_excel(url, sheet_name='Probands', engine='xlrd')
    siblings = pandas.read_excel(url, sheet_name='Siblings', engine='xlrd')

    # DataFrame.append() was removed in pandas 2.0, so the two sheets are
    # combined with pandas.concat() instead
    data = pandas.concat([probands, siblings], ignore_index=True)

    data['person_id'] = data['Child_ID'].astype(str)
    data['chrom'] = data['Chr'].str.replace('chr', '')
    data['pos'] = data['Pos (hg19)']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['build'] = 'grch37'

    # clean up the alleles
    data['ref'], data['alt'] = await fix_alleles(limiter, data)
    alleles = [fix_het_alleles(x.ref, x.alt) for _, x in data.iterrows()]
    data['ref'], data['alt'] = list(zip(*alleles))

    data['person_id'] += "|asd_cohorts"
    data['study'] = "10.1038/nature10945"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, row.build)
        variants.add(var)

    result.append(variants)
async def de_rubeis_nature_de_novos(result):
    """ get de novo data from the 2014 De Rubeis et al. autism exome study in Nature

    De novo mutation data sourced from Supplementary table 3:
    De Rubeis et al. (2014) Nature 515:209-215
    doi: 10.1038/nature13772

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    logging.info('getting De Rubeis et al Nature 2013 de novos')
    data = pandas.read_excel(url, sheet_name="De Novo", skipfooter=1)

    # rename columns to match the other de novo datasets
    data = data.rename(columns={
        'Chr': 'chrom',
        'Pos': 'pos',
        'Child_ID': 'person_id',
        'Ref': 'ref',
        'Alt': 'alt',
    })

    # strip whitespace and ensure columns are string. The pattern is a regular
    # expression (space OR tab), so regex=True has to be explicit: pandas 2.0
    # changed the default to literal matching, which would strip nothing.
    for col in ['person_id', 'chrom', 'ref', 'alt']:
        data[col] = data[col].astype(str).str.replace(' |\t', '', regex=True)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13772"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
async def iossifov_nature_de_novos(result):
    """ get de novo variants from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908

    Variants sourced from Supplementary tables S2, with person IDs sourced from
    Table S1.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    logging.info('getting Iossifov et al Nature 2014 de novos')

    # obtain the dataframes of de novo variants and of families. Context
    # managers release the temporary download, the archive and both member
    # handles once the tables have been parsed.
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        with ZipFile(temp.name) as archive:
            with archive.open('nature13908-s2/Supplementary Table 2.xlsx') as handle:
                data = pandas.read_excel(handle)
            with archive.open('nature13908-s2/Supplementary Table 1.xlsx') as handle:
                fams = pandas.read_excel(handle)

    chrom, pos, ref, alt = fix_coordinates(data['location'], data['vcfVariant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    sample_ids = get_sample_ids(fams)
    data['person_id'] = get_person_ids(data, sample_ids)
    data = tidy_families(data)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13908"
    data['confidence'] = 'high'

    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
            row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)
async def lelieveld_nn_de_novos(result, limiter):
    """ collect de novo data for the Lelieveld et al. intellectual disability exome study

    De novo mutation data sourced from supplementary table 2 from:
    Lelieveld et al. (2016) Nature Neuroscience 19:1194-1196
    doi: 10.1038/nn.4352

    NOTE(review): the earlier version of this docstring said the paper reports
    alignment to hg19 but that the table "is definitely for hg18 (GRCh37)".
    hg18 and GRCh37 are different assemblies (GRCh37 corresponds to hg19), so
    that remark is self-contradictory. The code labels these variants
    'grch37'; confirm which build the table really uses.

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
        limiter: handed straight to fix_alleles() (presumably a rate limiter
            for remote lookups - confirm).
    """
    logging.info('getting Lelieveld et al Nature Neuroscience 2016 de novos')
    table = pandas.read_excel(url, sheet_name='Supplementary Table 2')

    table['person_id'] = table['Patient key'].astype(str)
    table['chrom'] = table['Chromosome'].str.replace('chr', '')
    table['pos'] = table['Start position']
    table['ref'] = table['Reference Allele']
    table['alt'] = table['Variant Allele']
    table['build'] = 'grch37'

    fixed = await fix_alleles(limiter, table)
    table['ref'], table['alt'] = fixed

    table['person_id'] += '|lelieveld'
    table['study'] = '10.1038/nn.4352'
    table['confidence'] = 'high'

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, x.build)
        for _, x in table.iterrows()
    }
    result.append(de_novos)
async def jin_nature_genetics_de_novos(result):
    """ collect individual level data for Jin et al congenital heart disease

    Supplementary Table 9 from:
    Jin et al. Nature Genetics 49: 1593-1601, doi: 10.1038/ng.3970

    Args:
        result: collection with an append() method. A set of DeNovo objects
            (labelled 'grch37') is appended to it; nothing is returned.
    """
    logging.info('getting Jin et al Nature Genetics 2017 de novos')
    table = pandas.read_excel(url, sheet_name='S9', skiprows=1)

    person_ids = table['Blinded ID'].astype(str)
    table['person_id'] = person_ids + '|jin'
    table['chrom'] = table['CHROM'].astype(str)
    table['pos'] = table['POS']
    table['ref'] = table['REF']
    table['alt'] = table['ALT']
    table['study'] = '10.1038/ng.3970'
    table['confidence'] = 'high'

    de_novos = {
        DeNovo(x.person_id, x.chrom, x.pos, x.ref, x.alt, x.study,
               x.confidence, 'grch37')
        for _, x in table.iterrows()
    }
    result.append(de_novos)