def add_mock_probands(persons, required, prefix, suffix, phenotype, study):
    """ pad a cohort with mock probands for those without any de novos

    Args:
        persons: set of unique persons in the cohort (extended in place)
        required: number of persons with the given phenotype the cohort
            should hold once the mock probands have been added
        prefix: prefix for mock sample IDs
        suffix: suffix for mock sample IDs
        phenotype: phenotype of probands (some studies include affected and
            unaffected).
        study: study value passed unchanged to every mock Person

    Returns:
        the same persons set, after the mock probands were added to it
    """
    # seeding from the first known person of the cohort keeps the mock IDs
    # and sexes repeatable between runs
    random.seed(str(min(persons)))

    matching = [p for p in persons if p.phenotype == phenotype]

    # sample sexes at the male fraction seen among the current individuals,
    # so that the mock probands do not shift the ratio
    n_male = sum(p.sex == 'male' for p in matching)
    male_ratio = n_male / len(matching)

    n_missing = required - len(matching)
    for _ in range(n_missing):
        person_id = f'{prefix}_{random_id()}|{suffix}'
        if random.random() < male_ratio:
            sex = 'male'
        else:
            sex = 'female'
        persons.add(Person(person_id, sex, phenotype, study))

    return persons
def open_oroak_cohort():
    """ load the children of the O'Roak et al autism exome study

    O'Roak et al. (2012) Nature 485:246-250
    doi: 10.1038/nature10989
    Supplementary table 1

    Returns:
        set of Person objects for the children that are not siblings
    """
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1',
                              skipfooter=1, engine='xlrd')
    study = ['10.1038/nature10989']

    persons = set()
    for _, row in table.iterrows():
        # the part of the child ID after the first '.' gives the person type.
        # Siblings are ignored, since they don't have any de novos recorded,
        # so don't contribute to the exome-sequence populations
        if row.child.split('.')[1].startswith('s'):
            continue

        status = ['HP:0000717']
        if row['non-verbal_IQ'] < 70:
            status.append('HP:0001249')

        persons.add(Person(row.child + '|asd_cohorts', row.sex, status, study))

    return persons
def open_sanders_neuron_cohort():
    """ load probands and siblings from the Sanders et al 2015 cohort

    Supplementary Table 1 from:
    Sanders et al. (2015) Neuron 87:1215-1233
    doi: 10.1016/j.neuron.2015.09.016

    Returns:
        set of Person objects, probands as HP:0000717 and siblings as
        unaffected
    """
    table = pandas.read_excel(url, sheet_name='Sheet1')
    sexes = {
        'F': 'female',
        'female': 'female',
        'M': 'male',
        'male': 'male',
        'U': 'unknown',
    }
    study = ['10.1016/j.neuron.2015.09.016']

    persons = set()
    for _, row in table.iterrows():
        # skip rows where either parent is given as '.', and rows whose
        # cohort is flagged as 'SSC_Removed'
        if (row.Father == '.' or row.Mother == '.'
                or row.Cohort == 'SSC_Removed'):
            continue

        for role in ('Proband', 'Sibling'):
            sample_id = row[role]
            if sample_id == '.':
                # no child with this role in the row
                continue

            sex = sexes[row[f'{role}Sex']]
            if role == 'Sibling':
                phenotype = ['unaffected']
            else:
                phenotype = ['HP:0000717']
            persons.add(
                Person(sample_id + '|asd_cohorts', sex, phenotype, study))

    return persons
def open_de_rubeis_cohort():
    """ get proband data from the De Rubeis et al autism study

    De Rubeis et al. (2013) Nature 515:209-215
    doi: 10.1038/nature13772
    Supplementary Table 3, with some additional proband details sourced from
    Supplementary table S5 from Sanders et al. (2015) Neuron 87:1215-1233.

    Returns:
        set of Person objects from the table and the additional details,
        topped up with mock probands by add_mock_probands (target of 1445
        persons with the HP:0000717 phenotype)
    """
    data = pandas.read_excel(url, sheet_name='De Novo', skipfooter=1)

    # clean up a couple of columns
    data['person_id'] = data.Child_ID
    data['sex'] = data.Child_Sex.map({1: 'male', 2: 'female'})
    data['phenotype'] = data['Child_AffectedStatus'].map(
        {1: ['unaffected'], 2: ['HP:0000717']})
    data = data[['person_id', 'sex', 'phenotype']]

    # DataFrame.append was removed in pandas 2.0; pandas.concat gives the
    # same result on old and new pandas versions.
    # NOTE(review): assumes open_additional() returns a DataFrame - confirm
    additional = open_additional()
    data = pandas.concat([data, additional], ignore_index=True)

    data['person_id'] = data.person_id.astype(str)
    data['person_id'] += '|asd_cohorts'

    study = ['10.1038/nature13772']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, row.phenotype, study)
        persons.add(person)

    # include mock probands for the individuals without any de novos
    persons = add_mock_probands(persons, 1445, 'asd', 'asd_cohorts',
                                ['HP:0000717'], study)
    return persons
def open_jin_nature_genetics_cohort():
    """ load individuals from the Jin et al congenital heart disease cohort

    Supplementary Table 1 from:
    Jin et al. Nature Genetics 49: 1593-1601, doi: 10.1038/ng.3970

    Sex is not taken from the table; it is sampled per row at a fixed male
    fraction, after seeding the random number generator with 1.

    Returns:
        set of Person objects
    """
    random.seed(1)
    table = pandas.read_excel(url, sheet_name='S1', skiprows=1)
    table['person_id'] = table['Blinded ID'].astype(str) + '|jin'

    # get male fraction in trios from cohort sex counts in supplemental
    # table 2
    n_male, n_female = 1691, 1180
    male_fraction = n_male / (n_male + n_female)

    study = ['10.1038/ng.3970']
    persons = set()
    for _, row in table.iterrows():
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'

        status = ['HP:0001627']
        if row['NDD'] == 'Yes':
            status.append('HP:0001263')

        persons.add(Person(row.person_id, sex, status, study))

    return persons
def open_iossifov_neuron_cohort():
    """ load children sequenced in Iossifov et al., Neuron 2012

    Iossifov et al. (2012) Neuron 74:285-299
    doi: 10.1016/j.neuron.2012.04.009
    Data from supplementary tables S1, S2 and S3.

    Returns:
        set of Person objects, 'aut' children as HP:0000717 and 'sib'
        children as unaffected
    """
    sources = [
        (supp_s1_url, 'SNV.v4.1-normlized'),
        (supp_s2_url, 'suppLGKTable'),
        (supp_s3_url, 'ID.v4.1-normlized'),
    ]
    tables = [pandas.read_excel(link, sheet_name=sheet)
              for link, sheet in sources]

    # pool the family IDs and the matching child descriptions of all tables
    fam_ids = list(itertools.chain.from_iterable(t.quadId for t in tables))
    members = list(itertools.chain.from_iterable(t.inChild for t in tables))

    # every combination of child role and sex code that may be named in the
    # inChild text, e.g. 'autM' or 'sibF'
    combos = list(itertools.product(['aut', 'sib'], ['M', 'F']))

    study = ['10.1016/j.neuron.2012.04.009']
    persons = set()
    for fam, children in zip(fam_ids, members):
        for role, code in combos:
            if f'{role}{code}' not in children:
                continue

            is_proband = role == 'aut'
            status = ['HP:0000717'] if is_proband else ['unaffected']
            member = 'p1' if is_proband else 's1'
            sex = 'female' if code == 'F' else 'male'
            person_id = f'{fam}.{member}|asd_cohorts'
            persons.add(Person(person_id, sex, status, study))

    return persons
def open_halldorsson_science_cohort():
    """ load the cohort for Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043

    Sex is not taken from the table; it is sampled per row at a fixed female
    fraction, after seeding the random number generator with 1.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)
    with tempfile.NamedTemporaryFile() as temp:
        # the url redirects, so use the requests package to open the URL
        download_file(url, temp.name)
        df = pandas.read_table(temp.name, comment='#')

    df['person_id'] = df['Proband_id'].astype(str) + '|halldorsson'

    phenotype = ['unaffected']
    study = ['10.1126/science.aau1043']
    # assumption from the fraction from their earlier Jonsson et al
    # publication
    female_fraction = 0.5

    persons = set()
    for person_id in df['person_id']:
        if random.random() < female_fraction:
            sex = 'female'
        else:
            sex = 'male'
        persons.add(Person(person_id, sex, phenotype, study))

    return persons
def open_an_science_cohort():
    """ load individual level data for the An et al autism dataset

    Table S1 from:
    An et al. Science 362: eaat6576, doi: 10.1126/science.aat6576

    Returns:
        set of Person objects for the non-parental samples
    """
    with warnings.catch_warnings():
        # suppress warning about unknown extension that doesn't affect
        # loading data
        warnings.simplefilter("ignore")
        data = pandas.read_excel(url,
                                 sheet_name='Table S1 Sample information',
                                 skiprows=1, engine='openpyxl')

    data = data[['SampleID', 'FamilyID', 'Sex', 'Pheno', 'NVIQ']]
    study = ['10.1126/science.aat6576']

    persons = set()
    for _, row in data.iterrows():
        sample_id = row.SampleID
        if sample_id.endswith(('fa', 'mo')):
            # ignore parental samples
            continue

        if row.Pheno == 'control':
            status = ['unaffected']
        else:
            status = ['HP:0000717']

        # NOTE(review): only Python int values pass this check, so float or
        # numpy-typed IQ values are skipped - confirm the column's types
        nviq = row.NVIQ
        if isinstance(nviq, int) and nviq < 70:
            status.append('HP:0001249')

        persons.add(Person(sample_id + '|asd_cohorts', row.Sex, status, study))

    return persons
def open_cohort(path=None):
    """ load a cohort from a gzipped, tab-separated file

    The first line of the file is skipped as a header. Every later line
    holds four tab-separated fields: person ID, sex, comma-separated
    phenotypes and comma-separated studies.

    Args:
        path: path to the gzipped file. COHORT_PATH is used if this is None
            (or otherwise falsy).

    Returns:
        list of Person objects, in file order
    """
    if not path:
        path = COHORT_PATH

    cohort = []
    with gzip.open(path, 'rt') as handle:
        handle.readline()  # skip the header line
        for line in handle:
            fields = line.strip('\n').split('\t')
            person_id, sex, phenotypes, studies = fields
            cohort.append(Person(person_id, sex, phenotypes.split(','),
                                 studies.split(',')))

    return cohort
def open_jonsson_nature_cohort():
    """ get cohort for Jonsson et al Nature 2017

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Sex is not read from the table. Individuals with a chrX de novo call are
    set as female, and a small random fraction of the other individuals are
    also set as female (see the comments in the body). The random number
    generator is seeded with 1 so the sexes are repeatable between runs.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. The context managers make sure the
    # temporary file and the archive handles are closed once the table has
    # been read.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped:
            with zipped.open(path) as tarred:
                with tarfile.open(fileobj=tarred) as tar:
                    member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
                    data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype('str')

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    # we need to know which individuals are female. From the de novo table:
    #  - 99% of chrX dnms have alt fractions between 0.3-0.75
    #  - we expect 5% of DNMs to occur on chrX, but only have half that at 2%
    #  - only half (818 of 1548) of individuals have a chrX de novo call
    # These imply only females have chrX de novo calls. ChrX is 5% of the
    # genome, so we expect ~3.5 de novo calls on chrX per person. The chance
    # of a person having 0 chrX de novo calls is 3%, so the number of females
    # should be ~3% higher ((818 / (1 - 0.03)) - 818 = 25). Of the remaining
    # individuals, each is 3.4% likely to be female
    # (25 / (1548 - 818) = 0.0342)
    females = set(data.person_id[data.chrom == 'chrX'])

    # the expected count has to be the minuend: with the operands the other
    # way round the fraction is negative and no extra females can ever be
    # sampled
    missing_n = (len(females) / (1 - 0.0301)) - len(females)
    remainder_n = len(set(data.person_id)) - len(females)
    # avoid dividing by zero if every individual has a chrX de novo call
    female_remainder = missing_n / remainder_n if remainder_n else 0.0

    phenotype = ['unaffected']
    study = ['10.1038/nature24018']
    persons = set()
    # a person can be on many rows of the table, so sample the sex once per
    # unique person rather than once per row, otherwise the same individual
    # could be drawn as both male and female
    for person_id in data.person_id.unique():
        # individuals have two chances to be female, 1) if their sample ID is
        # in the female group, or 2) 3.4% of the remainder are female.
        is_female = (person_id in females
                     or random.random() < female_remainder)
        sex = 'female' if is_female else 'male'
        persons.add(Person(person_id, sex, phenotype, study))

    return persons
def subcohort(rows, counts, prefix, suffix, study):
    ''' build the persons of one sub-cohort, padded with mock probands

    Args:
        rows: table with a 'person_id' column, iterated with iterrows()
        counts: mapping of counts per group, which must include a 'male'
            key. The sum of the values is used as the sub-cohort size.
        prefix: prefix for mock sample IDs
        suffix: suffix for mock sample IDs
        study: study value passed to every Person

    Returns:
        set of Person objects with the HP:0001249 phenotype, as returned by
        add_mock_probands
    '''
    phenotype = ['HP:0001249']
    total = sum(counts.values())
    male_fraction = counts['male'] / total

    persons = set()
    for _, row in rows.iterrows():
        # sex is sampled at the male fraction given by the counts
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'
        persons.add(Person(row['person_id'], sex, phenotype, study))

    # account for individuals without exomic de novo mutations
    return add_mock_probands(persons, total, prefix, suffix, phenotype, study)
def open_homsy_science_cohort():
    """ gets individual level data for Homsy et al congenital heart disease

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    Sex is not taken from the table; it is sampled per row at a fixed male
    fraction, after seeding the random number generator with 1.

    Returns:
        set of Person objects
    """
    random.seed(1)

    # the context managers make sure the temporary download and the handle
    # for the spreadsheet inside the zip are closed once the table is read
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped:
            with zipped.open('homsy_database_S01.xlsx') as handle:
                data = pandas.read_excel(handle, 'Database S1', skiprows=1)

    # drop the row labelled 0, then give the renamed columns usable names
    data = data.drop(0, axis=0)
    data = data.rename(
        columns={
            'NDD determination if PCGC cohort': 'Developmental Delay',
            'Unnamed: 6': 'Learning Disability',
            'Unnamed: 7': 'Mental Retardation',
            'Unnamed: 8': 'Autism Spectrum'
        })

    data['person_id'] = data['Blinded ID']
    data['person_id'] += '|homsy'
    study = ['10.1126/science.aac9396']

    # estimate male fraction from proportion in Zaidi et al 2013, since the
    # sex isn't provided for individuals, nor the count of people per sex.
    male_fraction = 220 / (220 + 142)

    persons = set()
    for _, row in data.iterrows():
        status = ['HP:0001627']
        sex = 'male' if random.random() < male_fraction else 'female'
        if row['Developmental Delay'] == 'Yes':
            status.append('HP:0001263')
        if row['Mental Retardation'] == 'Yes':
            status.append('HP:0001249')
        if row['Autism Spectrum'] == 'Yes':
            status.append('HP:0000717')

        person = Person(row.person_id, sex, status, study)
        persons.add(person)

    return persons
def open_mcrae_nature_cohort():
    """ load the probands of McRae et al., Nature 2017

    McRae et al Nature 2017 542:433-438
    doi: 10.1038/nature21062
    Supplementary table S1.

    Returns:
        set of Person objects from the table, topped up with mock probands
        by add_mock_probands (target of 4293 persons)
    """
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1')
    table['Individual ID'] += '|DDD'

    phenotype = ['HP:0001249']
    study = ['10.1038/nature21062']

    persons = {
        Person(row['Individual ID'], row.Sex, phenotype, study)
        for _, row in table.iterrows()
    }

    # include mock probands for the individuals missing from the table
    return add_mock_probands(persons, 4293, 'ddd', 'DDD', phenotype, study)
def open_de_ligt_cohort():
    """ get individuals from De Ligt et al., 2012

    De Ligt et al., (2012) N Engl J Med 367:1921-1929
    doi:10.1056/NEJMoa1206524
    Proband details sourced from 'Clinical description of patients' section
    in supplementary material.

    Returns:
        set of Person objects with the HP:0001249 phenotype
    """
    # the context manager closes the temporary download once the table has
    # been extracted from it.
    # NOTE(review): assumes extract_table returns fully loaded data that no
    # longer needs the open file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_with_cookies(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|de_ligt'
    status = ['HP:0001249']
    study = ['10.1056/NEJMoa1206524']

    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
def open_epi4k_ajhg_cohort():
    """ gets individual level data for Epi4K cohort

    Supplementary Table 6 from:
    Epi4K AJHG 95: 360-370, doi: 10.1016/j.ajhg.2014.08.013

    Returns:
        set of Person objects with the HP:0001250 phenotype
    """
    # the context manager closes the temporary download once the table has
    # been extracted from it.
    # NOTE(review): assumes extract_table returns fully loaded data that no
    # longer needs the open file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|epi4k'
    status = ['HP:0001250']
    study = ['10.1016/j.ajhg.2014.08.013']

    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
def open_rauch_cohort():
    """ get person data for Rauch et al. intellectual disability exome study

    Rauch et al. (2012) Lancet 380:1674-1682
    doi: 10.1016/S0140-6736(12)61480-9
    Supplementary table 1

    Returns:
        set of Person objects with the HP:0001249 phenotype
    """
    # the context manager closes the temporary download once the table has
    # been extracted from it.
    # NOTE(review): assumes extract_table returns fully loaded data that no
    # longer needs the open file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|rauch'
    status = ['HP:0001249']
    study = ['10.1016/S0140-6736(12)61480-9']

    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
def open_iossifov_nature_cohort():
    """ get proband details from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Supplementary table S1.

    Returns:
        set of Person objects, probands as HP:0000717 (plus HP:0001249 if
        the verbal or non-verbal IQ is under 70) and other family members
        as unaffected
    """
    # the context manager removes the temporary directory (and the files
    # extracted into it) as soon as the table has been read
    with tempfile.TemporaryDirectory() as tempdir:
        zipf = os.path.join(tempdir, 'temp.zip')
        download_file(url, zipf)

        with ZipFile(zipf) as zipped:
            zipped.extractall(tempdir)

        path = os.path.join(tempdir, 'nature13908-s2',
                            'Supplementary Table 1.xlsx')
        data = pandas.read_excel(path, 'Supplement-T1-familiesTable')

    study = ['10.1038/nature13908']
    persons = set()
    for _, row in data.iterrows():
        fam = row.familyId
        for member in get_members(row):
            # member codes starting with 'p' are treated as the proband,
            # anything else uses the sibling columns
            is_proband = member[0] == 'p'
            sex = row['probandGender'] if is_proband else row['siblingGender']
            status = ['HP:0000717'] if is_proband else ['unaffected']
            if is_proband and (row.probandVIQ < 70 or row.probandNVIQ < 70):
                status.append('HP:0001249')

            # any gender code other than 'M' is recorded as female
            sex = 'male' if sex == 'M' else 'female'
            person_id = f'{fam}.{member}|asd_cohorts'
            person = Person(person_id, sex, status, study)
            persons.add(person)

    return persons
def open_lelieveld_cohort():
    """ load the probands of Lelieveld et al., 2016

    Lelieveld et al. (2016) Nature Neuroscience 19:1194-1196
    doi: 10.1038/nn.4352
    Supplementary table S2.

    One person is made for every integer from 1 up to the largest
    'Patient key' in the table. Sex is sampled at a fixed male fraction,
    after seeding the random number generator with 1.

    Returns:
        set of Person objects with the HP:0001249 phenotype
    """
    random.seed(1)
    table = pandas.read_excel(url, sheet_name='Supplementary Table 2')

    phenotype = ['HP:0001249']
    study = ['10.1038/nn.4352']
    male_fraction = 461 / (461 + 359)

    last_key = max(table['Patient key'])
    persons = set()
    for key in range(1, last_key + 1):
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'
        persons.add(Person(f'{key}|lelieveld', sex, phenotype, study))

    return persons
def open_sanders_nature_cohort():
    """ load individuals from the Sanders et al Nature 2012 cohort

    Sanders et al. (2012) Nature 485:237-241
    doi: 10.1038/nature10945
    Supplementary table S1

    Returns:
        set of Person objects for the non-parental samples, unaffected
        siblings as 'unaffected' and all others as HP:0000717
    """
    table = pandas.read_excel(url, sheet_name='Sheet1', engine='xlrd')
    study = ['10.1038/nature10945']

    persons = set()
    for _, row in table.iterrows():
        sample_id = row.Sample
        if sample_id.endswith(('fa', 'mo')):
            # ignore parental samples
            continue

        if row.Role == 'Unaffected_Sibling':
            status = ['unaffected']
        else:
            status = ['HP:0000717']

        persons.add(Person(sample_id + '|asd_cohorts', row.Gender.lower(),
                           status, study))

    return persons