async def gilissen_nature_de_novos(result, limiter):
    """ load de novos from Gilissen et al Nature 2014

    Nature 511: 344-347 2014, doi:10.1038/nature13394
    Supplementary table S8.

    Args:
        result: object with an append method (e.g. a list). The set of DeNovo
            variants is appended to it rather than being returned.
        limiter: passed straight through to fix_hgvs_coordinates.
    """
    logging.info('getting Gilissen et al Nature 2014 de novos')

    # the context manager closes the temporary file deterministically, even
    # if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data = clean_table(data)

    # convert the HGVS genomic strings into chrom, pos, ref and alt columns
    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter,
                                                      data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    # NOTE(review): the '|de_ligt' suffix presumably ties these samples to the
    # de Ligt cohort IDs - confirm against the cohort loaders.
    data['person_id'] += '|de_ligt'
    data['study'] = '10.1038/nature13394'
    data['confidence'] = 'high'

    variants = {
        DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
               row.study, row.confidence, 'grch37')
        for _, row in data.iterrows()
    }
    result.append(variants)
async def homsy_science_de_novos(result):
    """ get de novo variants for Homsy et al Science 2015

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    Args:
        result: object with an append method (e.g. a list). The set of DeNovo
            variants is appended to it rather than being returned.
    """
    logging.info('getting Homsy et al Science 2015 de novos')

    # context managers ensure the temporary download, the zip archive and the
    # spreadsheet handle inside it are all closed, even if parsing fails
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped, \
                zipped.open('homsy_database_S02.xlsx') as handle:
            data = pandas.read_excel(handle, 'Database S2', skiprows=1)

    data['person_id'] = data['Blinded ID'].astype(str)
    data['person_id'] += '|homsy'
    data['chrom'] = data['CHROM'].astype(str)
    data['pos'] = data['POS']
    data['ref'] = data['REF']
    data['alt'] = data['ALT']
    data['study'] = '10.1126/science.aac9396'
    data['confidence'] = 'high'

    variants = {
        DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
               row.study, row.confidence, 'grch37')
        for _, row in data.iterrows()
    }
    result.append(variants)
def open_halldorsson_science_cohort():
    """ get the individuals for Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043

    Sex is assigned at random (with a fixed seed), since this function does
    not read it from the table. One random draw is made per table row.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)

    with tempfile.NamedTemporaryFile() as handle:
        # the url redirects, so the file is fetched with download_file
        # before pandas reads it from disk
        download_file(url, handle.name)
        table = pandas.read_table(handle.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'

    phenotype = ['unaffected']
    study = ['10.1126/science.aau1043']
    # assumption from the fraction from their earlier Jonsson et al publication
    female_fraction = 0.5

    def random_sex():
        # consumes exactly one random number per call
        if random.random() < female_fraction:
            return 'female'
        return 'male'

    return {
        Person(person_id, random_sex(), phenotype, study)
        for person_id in table['person_id']
    }
async def halldorsson_science_de_novos(result):
    """ get de novo variants for Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043

    Halldorsson supersedes Jonsson et al, since at least 99.2% of the Jonsson
    et al samples occur in Halldorsson. See dnm_cohorts.halldorsson_check.py
    for more details.

    Args:
        result: object with an append method (e.g. a list). The set of DeNovo
            variants is appended to it rather than being returned.
    """
    logging.info('getting Halldorsson et al Science 2019 de novos')

    with tempfile.NamedTemporaryFile() as handle:
        # the url redirects, so the file is fetched with download_file
        # before pandas reads it from disk
        download_file(url, handle.name)
        table = pandas.read_table(handle.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'
    table['chrom'] = table['Chr'].astype(str)

    # copy the coordinate columns across under standardised names
    for new_name, old_name in (('pos', 'Pos'), ('ref', 'Ref'), ('alt', 'Alt')):
        table[new_name] = table[old_name]

    # constant columns, identical for every variant from this study
    table['study'] = '10.1126/science.aau1043'
    table['confidence'] = 'high'
    table['build'] = 'grch38'

    found = {
        DeNovo(rec.person_id, rec.chrom, rec.pos, rec.ref, rec.alt,
               rec.study, rec.confidence, rec.build)
        for rec in table.itertuples()
    }
    result.append(found)
def open_jonsson_nature_cohort():
    """ get cohort for Jonsson et al Nature 2017

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Sex is inferred rather than read from the table: individuals with a chrX
    de novo call are classed as female, and a small random fraction (fixed
    seed) of the remaining individuals are also assigned female.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Context managers close every handle
    # (temporary download, zip, zip member and tar) even if parsing fails.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive, \
                archive.open(path) as tar_handle, \
                tarfile.open(fileobj=tar_handle) as tar:
            member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
            data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype('str')

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    # we need to know which individuals are female. From the de novo table:
    #  - 99% of chrX dnms have alt fractions between 0.3-0.75
    #  - we expect 5% of DNMs to occur on chrX, but only have half that at 2%
    #  - only half (818 of 1548) of individuals have a chrX de novo call
    # These imply only females have chrX de novo calls. ChrX is 5% of the
    # genome, so we expect ~3.5 de novo calls on chrX per person. The chance
    # of a person having 0 chrX de novo calls is 3%, so the number of females
    # should be ~3% higher ((818 / (1 - 0.03)) - 818 = 25). Of the remaining
    # individuals, each is 3.4% likely to be female
    # (25 / (1548 - 818) = 0.0342)
    females = set(data.person_id[data.chrom == 'chrX'])

    # estimated total females minus observed females. Subtracting the other
    # way round gives a negative probability, in which case nobody from the
    # remainder could ever be assigned female.
    missing_n = (len(females) / (1 - 0.0301)) - len(females)
    remainder_n = len(set(data.person_id)) - len(females)
    # avoid dividing by zero when every individual has a chrX call
    female_remainder = missing_n / remainder_n if remainder_n else 0.0

    phenotype = ['unaffected']
    study = ['10.1038/nature24018']

    persons = set()
    for row in data.itertuples():
        # individuals have two chances to be female, 1) if their sample ID is
        # in the female group, or 2) 3.4% of the remainder are female. The
        # random draw only happens for individuals outside the female group.
        if row.person_id in females or random.random() < female_remainder:
            sex = 'female'
        else:
            sex = 'male'
        persons.add(Person(row.person_id, sex, phenotype, study))
    return persons
def open_homsy_science_cohort():
    """ gets individual level data for Homsy et al congenital heart disease

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    Sex is assigned at random (with a fixed seed), one draw per table row.

    Returns:
        set of Person objects
    """
    random.seed(1)

    # context managers ensure the temporary download, the zip archive and the
    # spreadsheet handle inside it are all closed, even if parsing fails
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped, \
                zipped.open('homsy_database_S01.xlsx') as handle:
            data = pandas.read_excel(handle, 'Database S1', skiprows=1)

    # NOTE(review): row 0 is presumably a second header row holding the
    # sub-column names (hence the 'Unnamed' columns below) - confirm.
    data = data.drop(0, axis=0)
    data = data.rename(
        columns={
            'NDD determination if PCGC cohort': 'Developmental Delay',
            'Unnamed: 6': 'Learning Disability',
            'Unnamed: 7': 'Mental Retardation',
            'Unnamed: 8': 'Autism Spectrum'
        })

    data['person_id'] = data['Blinded ID']
    data['person_id'] += '|homsy'
    study = ['10.1126/science.aac9396']

    # estimate male fraction from proportion in Zaidi et al 2013, since the
    # sex isn't provided for individuals, nor the count of people per sex.
    male_fraction = 220 / (220 + 142)

    persons = set()
    for _, row in data.iterrows():
        # every individual gets the cohort-wide term, then one extra term per
        # phenotype column flagged 'Yes'
        status = ['HP:0001627']
        sex = 'male' if random.random() < male_fraction else 'female'
        if row['Developmental Delay'] == 'Yes':
            status.append('HP:0001263')
        if row['Mental Retardation'] == 'Yes':
            status.append('HP:0001249')
        if row['Autism Spectrum'] == 'Yes':
            status.append('HP:0000717')

        persons.add(Person(row.person_id, sex, status, study))
    return persons
async def jonsson_nature_de_novos(result):
    """ get de novo variants for Jonsson et al Nature 2017

    This has been superseded by Halldorsson et al, since 99.2% of the samples
    from Jonsson et al exist in Halldorsson et al.

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Args:
        result: object with an append method (e.g. a list). The set of DeNovo
            variants is appended to it rather than being returned.
    """
    logging.info('getting Jonsson et al Nature 2017 de novos')

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Context managers close every handle
    # (temporary download, zip, zip member and tar) even if parsing fails.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive, \
                archive.open(path) as tar_handle, \
                tarfile.open(fileobj=tar_handle) as tar:
            member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
            data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype(str)
    data['pos'] = data['Pos_hg38']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['study'] = '10.1038/nature24018'
    data['confidence'] = 'high'
    data['build'] = 'grch38'

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    variants = {
        DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
               row.study, row.confidence, row.build)
        for row in data.itertuples()
    }
    result.append(variants)
def open_epi4k_ajhg_cohort():
    """ gets individual level data for Epi4K cohort

    Supplementary Table 6 from:
    Epi4K AJHG 95: 360-370, doi: 10.1016/j.ajhg.2014.08.013

    Returns:
        set of Person objects, using the sex given in the extracted table
    """
    # the context manager closes the temporary file deterministically, even
    # if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|epi4k'
    # phenotype and study are the same for every individual in the cohort
    status = ['HP:0001250']
    study = ['10.1016/j.ajhg.2014.08.013']

    persons = set()
    for _, row in data.iterrows():
        persons.add(Person(row.person_id, row.sex, status, study))
    return persons
def open_rauch_cohort():
    """ get person data for Rauch et al. intellectual disability exome study

    Rauch et al. (2012) Lancet 380:1674-1682
    doi: 10.1016/S0140-6736(12)61480-9
    Supplementary table 1

    Returns:
        set of Person objects, using the sex given in the extracted table
    """
    # the context manager closes the temporary file deterministically, even
    # if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|rauch'
    # phenotype and study are the same for every individual in the cohort
    status = ['HP:0001249']
    study = ['10.1016/S0140-6736(12)61480-9']

    persons = set()
    for _, row in data.iterrows():
        persons.add(Person(row.person_id, row.sex, status, study))
    return persons
def open_iossifov_nature_cohort():
    """ get proband details from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Supplementary table S1.

    Returns:
        set of Person objects, one per family member yielded by get_members
    """
    # the context manager removes the temporary directory (and the extracted
    # archive contents) deterministically, even if the download or parsing
    # raises
    with tempfile.TemporaryDirectory() as tempdir:
        zipf = os.path.join(tempdir, 'temp.zip')
        download_file(url, zipf)

        with ZipFile(zipf) as zipped:
            zipped.extractall(tempdir)

        path = os.path.join(tempdir, 'nature13908-s2',
                            'Supplementary Table 1.xlsx')
        data = pandas.read_excel(path, 'Supplement-T1-familiesTable')

    study = ['10.1038/nature13908']
    persons = set()
    for _, row in data.iterrows():
        fam = row.familyId
        for member in get_members(row):
            # member codes starting with 'p' are treated as the proband,
            # everything else takes the sibling columns
            is_proband = member[0] == 'p'
            sex = row['probandGender'] if is_proband else row['siblingGender']
            status = ['HP:0000717'] if is_proband else ['unaffected']
            # probands with either IQ score under 70 get an additional term
            if is_proband and (row.probandVIQ < 70 or row.probandNVIQ < 70):
                status.append('HP:0001249')

            # anything other than 'M' is classed as female
            sex = 'male' if sex == 'M' else 'female'
            person_id = f'{fam}.{member}|asd_cohorts'
            persons.add(Person(person_id, sex, status, study))
    return persons
async def iossifov_nature_de_novos(result):
    """ get de novo variants from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Variants sourced from Supplementary tables S2, with person IDs sourced
    from Table S1.

    Args:
        result: object with an append method (e.g. a list). The set of DeNovo
            variants is appended to it rather than being returned.
    """
    logging.info('getting Iossifov et al Nature 2014 de novos')

    # context managers ensure the temporary download, the zip archive and both
    # spreadsheet handles inside it are closed, even if parsing fails
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        with ZipFile(temp.name) as handle:
            # obtain the dataframe of de novo variants
            with handle.open(
                    'nature13908-s2/Supplementary Table 2.xlsx') as table_s2:
                data = pandas.read_excel(table_s2)
            # and the families table, used below to look up sample IDs
            with handle.open(
                    'nature13908-s2/Supplementary Table 1.xlsx') as table_s1:
                fams = pandas.read_excel(table_s1)

    chrom, pos, ref, alt = fix_coordinates(data['location'],
                                           data['vcfVariant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    sample_ids = get_sample_ids(fams)
    data['person_id'] = get_person_ids(data, sample_ids)
    data = tidy_families(data)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13908"
    data['confidence'] = 'high'

    variants = {
        DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
               row.study, row.confidence, 'grch37')
        for _, row in data.iterrows()
    }
    result.append(variants)