Python download_fileの例、dnm_cohorts.download_file.download_file Pythonの例

コード例 #1

0

ファイルを表示

ファイル: gilissen_nature.py プロジェクト: jeremymcrae/dnm_cohorts

async def gilissen_nature_de_novos(result, limiter):
    """ load de novos from Gilissen et al Nature 2014
    
    Nature 511: 344-347 2014, doi:10.1038/nature13394
    Supplementary table S8.
    
    Args:
        result: list-like object; the set of DeNovo objects built here is
            appended to it (nothing is returned).
        limiter: passed through unchanged to fix_hgvs_coordinates().
    """
    logging.info('getting Gilissen et al Nature 2014 de novos')
    # the context manager guarantees the temporary file is closed (and so
    # removed) even if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)
    data = clean_table(data)

    chrom, pos, ref, alt = await fix_hgvs_coordinates(limiter,
                                                      data.hgvs_genomic)
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    data['person_id'] += '|de_ligt'
    data['study'] = '10.1038/nature13394'
    data['confidence'] = 'high'

    # named 'variants' rather than 'vars' to avoid shadowing the builtin
    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)

コード例 #2

0

ファイルを表示

ファイル: homsy_science.py プロジェクト: jeremymcrae/dnm_cohorts

async def homsy_science_de_novos(result):
    """ get de novo variants for Homsy et al Science 2015
    
    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396
    
    Args:
        result: list-like object; the set of DeNovo objects built here is
            appended to it (nothing is returned).
    """
    logging.info('getting Homsy et al Science 2015 de novos')
    # context managers close the temporary download, the zip archive and the
    # archive member once the spreadsheet has been read
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped, \
                zipped.open('homsy_database_S02.xlsx') as handle:
            data = pandas.read_excel(handle, 'Database S2', skiprows=1)

    data['person_id'] = data['Blinded ID'].astype(str)
    data['person_id'] += '|homsy'
    data['chrom'] = data['CHROM'].astype(str)
    data['pos'] = data['POS']
    data['ref'] = data['REF']
    data['alt'] = data['ALT']
    data['study'] = '10.1126/science.aac9396'
    data['confidence'] = 'high'

    # named 'variants' rather than 'vars' to avoid shadowing the builtin
    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)

コード例 #3

0

ファイルを表示

ファイル: halldorsson_science.py プロジェクト: jeremymcrae/dnm_cohorts

def open_halldorsson_science_cohort():
    """ get the cohort (set of Person objects) for Halldorsson et al Science 2019
    
    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043
    
    One Person is created per row of the downloaded table. Sex is not read
    from the table; it is drawn at random (the random module is seeded with 1
    first, so the draws are repeatable).
    
    Returns:
        set of Person objects
    """
    random.seed(1)

    with tempfile.NamedTemporaryFile() as handle:
        # the url redirects, so use the requests package to open the URL
        download_file(url, handle.name)
        table = pandas.read_table(handle.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'

    # the same phenotype and study lists are shared by every Person
    phenotype = ['unaffected']
    study = ['10.1126/science.aau1043']
    # assumption from the fraction from their earlier Jonsson et al publication
    female_fraction = 0.5

    def draw_sex():
        # one random draw per table row
        if random.random() < female_fraction:
            return 'female'
        return 'male'

    return {Person(entry.person_id, draw_sex(), phenotype, study)
            for entry in table.itertuples()}

コード例 #4

0

ファイルを表示

ファイル: halldorsson_science.py プロジェクト: jeremymcrae/dnm_cohorts

async def halldorsson_science_de_novos(result):
    """ collect the de novo variants from Halldorsson et al Science 2019
    
    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043
    
    Halldorsson supersedes Jonsson et al, since at least 99.2% of the Jonsson
    et al samples occur in Halldorsson. See dnm_cohorts.halldorsson_check.py
    for more details.
    
    Args:
        result: list-like object; the set of DeNovo objects built here is
            appended to it (nothing is returned).
    """
    logging.info('getting Halldorsson et al Science 2019 de novos')
    with tempfile.NamedTemporaryFile() as handle:
        # the url redirects, so use the requests package to open the URL
        download_file(url, handle.name)
        table = pandas.read_table(handle.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str) + '|halldorsson'
    table['chrom'] = table['Chr'].astype(str)
    for target, source in (('pos', 'Pos'), ('ref', 'Ref'), ('alt', 'Alt')):
        table[target] = table[source]

    # these three values are identical for every variant in this dataset
    study = '10.1126/science.aau1043'
    confidence = 'high'
    build = 'grch38'

    result.append({
        DeNovo(entry.person_id, entry.chrom, entry.pos, entry.ref, entry.alt,
               study, confidence, build)
        for entry in table.itertuples()
    })

コード例 #5

0

ファイルを表示

ファイル: jonsson_nature.py プロジェクト: jeremymcrae/dnm_cohorts

def open_jonsson_nature_cohort():
    """ get cohort for Jonsson et al Nature 2017
    
    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018
    
    Sex is not read from the table. Individuals with a chrX de novo call are
    treated as female, and the remaining individuals are assigned female at
    random with a small estimated probability (see comments below). The random
    module is seeded with 1 first, so the draws are repeatable.
    
    Returns:
        set of Person objects
    """
    random.seed(1)

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Context managers close the temporary
    # download, the archive and the archive member once the table is read.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive, \
                archive.open(path) as tar_handle, \
                tarfile.open(fileobj=tar_handle) as tar:
            member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
            data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype('str')

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    # we need to know which individuals are female. From the de novo table:
    #   - 99% of chrX dnms have alt fractions between 0.3-0.75
    #   - we expect 5% of DNMs to occur on chrX, but only have half that at 2%
    #   - only half (818 of 1548) of individuals have a chrX de novo call
    # These imply only females have chrX de novo calls. ChrX is 5% of the genome,
    # so we expect ~3.5 de novo calls on chrX per person. The chance of a person
    # having 0 chrX de novo calls is 3%, so the number of females should be ~3%
    # higher ((818 / (1 - 0.03)) - 818 = 25). Of the remaining individuals,
    # each is 3.4% likely to be female (25 / (1548 - 818) = 0.0342)
    females = set(data.person_id[data.chrom == 'chrX'])
    # expected total females minus observed females. The operands were
    # previously the other way round, which gave a negative count and so a
    # negative probability that random.random() could never fall below.
    missing_n = (len(females) / (1 - 0.0301)) - len(females)
    female_remainder = missing_n / (len(set(data.person_id)) - len(females))

    phenotype = ['unaffected']
    study = ['10.1038/nature24018']

    persons = set()
    for row in data.itertuples():
        # individuals have two chances to be female, 1) if their sample id is in
        # the female group, or 2) 3.4% of the remainder are female.
        is_female = (row.person_id in females
                     or random.random() < female_remainder)
        sex = 'female' if is_female else 'male'
        person = Person(row.person_id, sex, phenotype, study)
        persons.add(person)

    return persons

コード例 #6

0

ファイルを表示

ファイル: homsy_science.py プロジェクト: jeremymcrae/dnm_cohorts

def open_homsy_science_cohort():
    """ gets individual level data for Homsy et al congenital heart disease
    
    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396
    
    Sex is not read from the table; it is drawn at random using an estimated
    male fraction (the random module is seeded with 1 first, so the draws are
    repeatable).
    
    Returns:
        set of Person objects
    """
    random.seed(1)
    # context managers close the temporary download, the zip archive and the
    # archive member once the spreadsheet has been read
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped, \
                zipped.open('homsy_database_S01.xlsx') as handle:
            data = pandas.read_excel(handle, 'Database S1', skiprows=1)

    # drop the row labelled 0, then give the phenotype columns usable names
    data = data.drop(0, axis=0)
    data = data.rename(
        columns={
            'NDD determination if PCGC cohort': 'Developmental Delay',
            'Unnamed: 6': 'Learning Disability',
            'Unnamed: 7': 'Mental Retardation',
            'Unnamed: 8': 'Autism Spectrum'
        })

    data['person_id'] = data['Blinded ID']
    data['person_id'] += '|homsy'
    study = ['10.1126/science.aac9396']

    # estimate male fraction from proportion in Zaidi et al 2013, since the
    # sex isn't provided for individuals, nor the count of people per sex.
    male_fraction = 220 / (220 + 142)

    persons = set()
    for _, row in data.iterrows():
        # every individual gets the first term; extra terms are appended for
        # each phenotype column flagged 'Yes'
        status = ['HP:0001627']
        sex = 'male' if random.random() < male_fraction else 'female'
        if row['Developmental Delay'] == 'Yes':
            status.append('HP:0001263')
        if row['Mental Retardation'] == 'Yes':
            status.append('HP:0001249')
        if row['Autism Spectrum'] == 'Yes':
            status.append('HP:0000717')

        person = Person(row.person_id, sex, status, study)
        persons.add(person)

    return persons

コード例 #7

0

ファイルを表示

ファイル: jonsson_nature.py プロジェクト: jeremymcrae/dnm_cohorts

async def jonsson_nature_de_novos(result):
    """ get de novo variants for Jonsson et al Nature 2017
    
    This has been superseded by Halldorsson et al, since 99.2% of the samples
    from Jonsson et al exist in Halldorsson et al.
    
    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018
    
    Args:
        result: list-like object; the set of DeNovo objects built here is
            appended to it (nothing is returned).
    """
    logging.info('getting Jonsson et al Nature 2017 de novos')

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Context managers close the temporary
    # download, the archive and the archive member once the table is read.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive, \
                archive.open(path) as tar_handle, \
                tarfile.open(fileobj=tar_handle) as tar:
            member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
            data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype(str)
    data['pos'] = data['Pos_hg38']
    data['ref'] = data['Ref']
    data['alt'] = data['Alt']
    data['study'] = '10.1038/nature24018'
    data['confidence'] = 'high'
    data['build'] = 'grch38'

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    # named 'variants' rather than 'vars' to avoid shadowing the builtin
    variants = set()
    for row in data.itertuples():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, row.build)
        variants.add(var)

    result.append(variants)

コード例 #8

0

ファイルを表示

ファイル: epi4k_ajhg.py プロジェクト: jeremymcrae/dnm_cohorts

def open_epi4k_ajhg_cohort():
    """ gets individual level data for Epi4K cohort
    
    Supplementary Table 6 from:
    Epi4K AJHG 95: 360-370, doi: 10.1016/j.ajhg.2014.08.013
    
    Returns:
        set of Person objects, one per row of the extracted table
    """
    # the context manager guarantees the temporary file is closed (and so
    # removed) even if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|epi4k'
    # the same phenotype and study lists are shared by every Person
    status = ['HP:0001250']
    study = ['10.1016/j.ajhg.2014.08.013']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons

コード例 #9

0

ファイルを表示

ファイル: rauch_lancet.py プロジェクト: jeremymcrae/dnm_cohorts

def open_rauch_cohort():
    """ get person data for Rauch et al. intellectual disability exome study
    
    Rauch et al. (2012) Lancet 380:1674-1682
    doi: 10.1016/S0140-6736(12)61480-9
    Supplementary table 1
    
    Returns:
        set of Person objects, one per row of the extracted table
    """
    # the context manager guarantees the temporary file is closed (and so
    # removed) even if the download or the table extraction raises
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|rauch'

    # the same phenotype and study lists are shared by every Person
    status = ['HP:0001249']
    study = ['10.1016/S0140-6736(12)61480-9']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons

コード例 #10

0

ファイルを表示

ファイル: iossifov_nature.py プロジェクト: jeremymcrae/dnm_cohorts

def open_iossifov_nature_cohort():
    """ get proband details from Iossifov et al., Nature 2014
    
    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Supplementary table S1.
    
    Returns:
        set of Person objects, one per family member yielded by get_members()
        for each row of the families table
    """
    # the context manager removes the temporary directory (and the extracted
    # archive contents) as soon as the spreadsheet has been read
    with tempfile.TemporaryDirectory() as tempdir:
        zipf = os.path.join(tempdir, 'temp.zip')
        download_file(url, zipf)

        with ZipFile(zipf) as zipped:
            zipped.extractall(tempdir)

        path = os.path.join(tempdir, 'nature13908-s2',
                            'Supplementary Table 1.xlsx')
        data = pandas.read_excel(path, 'Supplement-T1-familiesTable')

    study = ['10.1038/nature13908']

    persons = set()
    for _, row in data.iterrows():
        fam = row.familyId
        for member in get_members(row):
            # member codes starting with 'p' are handled as the proband,
            # everything else as a sibling
            is_proband = member[0] == 'p'
            sex = row['probandGender'] if is_proband else row['siblingGender']

            status = ['HP:0000717'] if is_proband else ['unaffected']
            # probands with either IQ measure under 70 get an extra term
            if is_proband and (row.probandVIQ < 70 or row.probandNVIQ < 70):
                status.append('HP:0001249')
            sex = 'male' if sex == 'M' else 'female'
            person_id = f'{fam}.{member}|asd_cohorts'

            person = Person(person_id, sex, status, study)
            persons.add(person)

    return persons

コード例 #11

0

ファイルを表示

ファイル: iossifov_nature.py プロジェクト: jeremymcrae/dnm_cohorts

async def iossifov_nature_de_novos(result):
    """ get de novo variants from Iossifov et al., Nature 2014
    
    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Variants sourced from Supplementary tables S2, with person IDs sourced from
    Table S1.
    
    Args:
        result: list-like object; the set of DeNovo objects built here is
            appended to it (nothing is returned).
    """
    logging.info('getting Iossifov et al Nature 2014 de novos')
    # context managers close the temporary download, the zip archive and each
    # archive member once the spreadsheets have been read
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)

        with ZipFile(temp.name) as archive:
            # obtain the dataframe of de novo variants
            with archive.open(
                    'nature13908-s2/Supplementary Table 2.xlsx') as handle:
                data = pandas.read_excel(handle)
            # and the families table, used below to look up sample IDs
            with archive.open(
                    'nature13908-s2/Supplementary Table 1.xlsx') as handle:
                fams = pandas.read_excel(handle)

    chrom, pos, ref, alt = fix_coordinates(data['location'],
                                           data['vcfVariant'])
    data['chrom'], data['pos'], data['ref'], data['alt'] = chrom, pos, ref, alt

    sample_ids = get_sample_ids(fams)
    data['person_id'] = get_person_ids(data, sample_ids)
    data = tidy_families(data)

    data['person_id'] += '|asd_cohorts'
    data['study'] = "10.1038/nature13908"
    data['confidence'] = 'high'

    # named 'variants' rather than 'vars' to avoid shadowing the builtin
    variants = set()
    for _, row in data.iterrows():
        var = DeNovo(row.person_id, row.chrom, row.pos, row.ref, row.alt,
                     row.study, row.confidence, 'grch37')
        variants.add(var)

    result.append(variants)