Esempio n. 1
0
def main(**kwargs):
    """Assemble the inputs for ``get_hgf`` and return its result.

    ``kwargs`` must carry ``hpo_json`` and ``patient_info_file``; the objects
    loaded from them are stored back under ``hpo_db`` and ``patient_info``.
    The enriched ``kwargs`` are forwarded to ``Pheno.main``.  Its
    ``patient_map`` and ``patients_variants`` entries are moved out of the
    returned mapping into ``kwargs``; what is left of the mapping is passed
    on as ``data``.
    """
    # load the HPO database and the patient snapshot
    hpo_json = kwargs['hpo_json']
    kwargs['hpo_db'] = get_hpo_from_json(hpo_json)
    info_file = kwargs['patient_info_file']
    kwargs['patient_info'] = helper.get_snapshot(info_file)

    # run the phenogenon step, then split its result into separate arguments
    phenogenon_result = Pheno.main(**kwargs)
    kwargs['data'] = phenogenon_result
    for moved_key in ('patient_map', 'patients_variants'):
        kwargs[moved_key] = phenogenon_result.pop(moved_key)

    # get hgf
    return get_hgf(**kwargs)
Esempio n. 2
0
def main(**kwargs):
    """Compute the hgf result and optionally draw the phenogenon heatmaps.

    ``kwargs`` must carry ``hpo_json`` and ``patient_info_file``; the objects
    loaded from them are stored back under ``hpo_db`` and ``patient_info``
    before everything is forwarded to ``Pheno.main``.  The ``patient_map``
    and ``patients_variants`` entries of that result are moved into
    ``kwargs`` and the remainder is passed on as ``data``.

    When ``heatmap_outdir`` is given (not ``None``), ``draw_phenogenon`` is
    called with an extra ``hpos`` argument holding the keys found under
    ``result -> hgf -> r`` and ``result -> hgf -> d`` of the hgf output.

    Returns the value produced by ``get_hgf``.
    """
    # load the HPO database and the patient snapshot
    hpo_json = kwargs['hpo_json']
    kwargs['hpo_db'] = get_hpo_from_json(hpo_json)
    info_file = kwargs['patient_info_file']
    kwargs['patient_info'] = helper.get_snapshot(info_file)

    # run the phenogenon step, then split its result into separate arguments
    phenogenon_result = Pheno.main(**kwargs)
    kwargs['data'] = phenogenon_result
    for moved_key in ('patient_map', 'patients_variants'):
        kwargs[moved_key] = phenogenon_result.pop(moved_key)

    hgf = get_hgf(**kwargs)

    # no heatmap directory requested: nothing more to do
    if kwargs.get('heatmap_outdir') is None:
        return hgf

    # produce heatmaps for the HPO terms reported in each inheritance mode
    per_mode = hgf['result']['hgf']
    kwargs['hpos'] = {mode: per_mode[mode].keys() for mode in ('r', 'd')}
    draw_phenogenon(**kwargs)
    return hgf
Esempio n. 3
0
def main(**kwargs):
    '''
    Compute the phenogenon matrix of every testable HPO term in both
    inheritance modes ('r' and 'd').

    keyword arguments checked through helper.check_args:
     N, gnomad_path, patient_mini_file, patient_info_file, unrelated_file,
     gnomad_cutoff, gnomad_step, hpo_mask, cadd_step, cadd_min
    of these, this function itself reads:
     patient_info_file: snapshot passed to helper.get_snapshot
     N: an HPO term is tested only when its value in helper.get_phs(...)
        is at least N
     hpo_mask: HPO terms in this container are never tested
    all keyword arguments (plus a default empty 'gene_inheritance_mode')
    are forwarded to PM.main.

    returns None when PM.main returns None, otherwise a dict with
     'phenogenon': {'r': {hpo: matrix as nested list}, 'd': {...}}
     'NP', 'patient_map', 'patients_variants': copied from PM.main's result
    '''
    # check args
    required = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
    }
    helper.check_args(required, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})

    # patient snapshot and the per-HPO values used for the N filter
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    hpo_values = helper.get_phs(patient_info)

    # get patient_map; nothing to do without one
    pm_result = PM.main(**kwargs)
    if pm_result is None:
        return None

    inheritance_modes = 'rd'
    # patient_map keys arrive as comma-separated strings; turn them back
    # into tuples of ints
    raw_maps = pm_result['patient_map']
    keyed_maps = {
        mode: {
            tuple(int(part) for part in raw_key.split(',')): members
            for raw_key, members in raw_maps[mode].items()
        }
        for mode in inheritance_modes
    }

    # all testable hpos
    testable = [
        term for term, value in hpo_values.items()
        if value >= kwargs['N'] and term not in kwargs['hpo_mask']
    ]

    matrices = {mode: {} for mode in inheritance_modes}
    for term in testable:
        for mode in inheritance_modes:
            genon = helper.phenogenon(
                hpos=term,
                mode=mode,
                patient_info=patient_info,
                patient_map=keyed_maps[mode],
            )
            # store as plain lists
            matrices[mode][term] = genon.tolist()

    return {
        'phenogenon': matrices,
        'NP': pm_result['NP'],
        'patient_map': pm_result['patient_map'],
        'patients_variants': pm_result['patients_variants'],
    }
Esempio n. 4
0
def _build_phenogenon_cache(patient_map, modes, hpos_to_test, patient_info):
    '''
    Compute the phenogenon matrix of every (mode, HPO) pair for one gene.

    patient_map: the gene's record from the patient_maps file. The keys of
      patient_map['patient_map'][mode] are comma-separated integers; they
      are turned back into tuples of ints before being handed to
      helper.phenogenon.
    modes: iterable of inheritance modes to evaluate ('r' and/or 'd')
    hpos_to_test: HPO terms to evaluate
    patient_info: patient snapshot forwarded to helper.phenogenon

    returns {'r': {hpo: matrix as nested list}, 'd': {...}}; a mode that is
    not in modes keeps an empty dict.
    '''
    phenogenon_cache = {'r': {}, 'd': {}}
    # translate patient_map's key
    pm = {}
    for m in modes:
        pm[m] = {
            tuple(int(i) for i in k.split(',')): v
            for k, v in patient_map['patient_map'][m].items()
        }
    for hpos in hpos_to_test:
        for mode in modes:
            genon = helper.phenogenon(
                hpos=hpos,
                mode=mode,
                patient_info=patient_info,
                patient_map=pm[mode],
            )
            phenogenon_cache[mode][hpos] = genon.tolist()
    return phenogenon_cache


def main(**kwargs):
    '''
    Compute the phenogenon matrices of every gene on one chromosome and
    stream them, gene by gene, into a single JSON object on disk.

    keyword arguments checked through helper.check_args:
     N, gnomad_path, patient_mini_file, patient_info_file, unrelated_file,
     gnomad_cutoff, gnomad_step, hpo_mask, cadd_step, cadd_min, output
    also read:
     chrom: chromosome whose genes are processed
     patient_maps_path: directory holding '<chrom>.json' patient maps
     hpos_to_test: optional list of HPO terms; when missing or empty, every
       HPO whose value in helper.get_phs(...) is at least N is tested
     gene_inheritance_mode: optional {gene name or gene id: modes},
       default modes are 'rd'
    HPO terms found in hpo_mask are never tested.

    The output file maps gene_id to {'symbol', 'phenogenon', 'NP'}. Genes
    without an entry in the patient_maps file are skipped.

    returns None. If the output file already exists nothing is done. If the
    run fails, the partially written output is removed so that a later run
    is not mistaken for 'already done'.
    '''
    # check args
    compulsory_keys = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # output already exist?
    if os.path.isfile(kwargs['output']):
        print('already done')
        return None
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    phs = helper.get_phs(patient_info)

    # add cohort info into patient_mini
    # NOTE(review): patient_mini is not read again in this function after
    # being annotated here - confirm whether this lookup is still needed.
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # $in needs a real list: a dict keys view cannot be BSON-encoded
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini)}},
        {'external_id': 1, 'contact': 1},
    )
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact,
        }

    # all testable hpos; the list does not depend on the gene
    requested_hpos = kwargs.get('hpos_to_test')
    if requested_hpos:
        hpos_to_test = [i for i in requested_hpos
                        if i not in kwargs['hpo_mask']]
    else:
        hpos_to_test = [i for i, v in phs.items()
                        if v >= kwargs['N']
                        and i not in kwargs['hpo_mask']]

    # get patient_maps
    patient_maps_file = os.path.join(
        kwargs['patient_maps_path'],
        '{}.json'.format(kwargs['chrom']),
    )
    with open(patient_maps_file, 'r') as inf:
        patient_maps = json.load(inf)

    # get the genes of this chromosome
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                  MONGO['phenopolis_db'])

    completed = False
    try:
        with open(kwargs['output'], 'w') as outf:
            outf.write('{')
            number_processed = 0
            for gene_range in gene_ranges:
                # get patient_map
                patient_map = patient_maps.pop(gene_range['gene_id'], None)
                if patient_map is None:
                    continue
                # print progress
                number_processed += 1
                if not number_processed % 100:
                    print('===processed {} genes==='.format(
                        number_processed))
                print('processing {}'.format(gene_range['gene_name']))

                # the mode may be registered under the gene name or its id
                modes = kwargs['gene_inheritance_mode'].get(
                    gene_range['gene_name'],
                    kwargs['gene_inheritance_mode'].get(
                        gene_range['gene_id'],
                        'rd',
                    ),
                )

                phenogenon_cache = _build_phenogenon_cache(
                    patient_map, modes, hpos_to_test, patient_info)

                output = json.dumps({
                    gene_range['gene_id']: {
                        'symbol': gene_range['gene_name'],
                        'phenogenon': phenogenon_cache,
                        'NP': patient_map['NP'],
                    }
                })
                # strip off the braces: the records are concatenated into
                # one big JSON object
                output = output[1:-1]
                if number_processed != 1:
                    # meaning not the first record. add a comma
                    output = ',' + output
                outf.write(output)
            outf.write('}')
        completed = True
    finally:
        # close cursor
        gene_ranges.close()
        # a truncated file would make the next run report 'already done'
        if not completed and os.path.isfile(kwargs['output']):
            os.remove(kwargs['output'])
def _load_chrom_json(directory, chrom):
    """Return the parsed content of ``<directory>/<chrom>.json``."""
    with open(os.path.join(directory, '{}.json'.format(chrom)), 'r') as inf:
        return json.load(inf)


def main(**kwargs):
    """Run ``get_phenogenon`` on precomputed per-chromosome JSON files.

    Keyword arguments read here:
      output: path of the result file; must be given and non-empty.
      patient_info_file: snapshot loaded into ``kwargs['patient_info']``.
      phenogenon_path, patient_maps_path, patients_variants_path:
        directories holding one ``<chrom>.json`` file each.
      genes: optional list of gene names.  When present, the genes are
        looked up in MongoDB, grouped by chromosome and processed one by
        one; chromosomes without a phenogenon file are skipped and the
        collected results are printed (nothing is written to ``output``).
      chrom: used when ``genes`` is absent; every gene of that chromosome's
        phenogenon file is processed and the results are streamed to
        ``output`` as one JSON object keyed by gene id.

    For each gene, ``patient_map``, ``patients_variants`` and ``data`` are
    set in ``kwargs`` before calling ``get_phenogenon(**kwargs)``.

    Raises:
      ValueError: if ``output`` is missing or empty.
    """
    if not kwargs.get('output'):
        msg = 'Need to specify output'
        raise ValueError(msg)
    # get patient_info
    kwargs['patient_info'] = helper.get_snapshot(kwargs['patient_info_file'])

    if 'genes' in kwargs:
        result = {}
        # find gene_id and chrom
        gene_cursor = MONGO['phenopolis_db'].genes.find(
            {'gene_name': {'$in': kwargs['genes']}},
            {'_id': 0, 'gene_id': 1, 'chrom': 1},
        )
        # aggregate on chrom
        chroms = defaultdict(list)
        for g in gene_cursor:
            chroms[g['chrom']].append(g['gene_id'])
        for chrom, gene_ids in chroms.items():
            try:
                data = _load_chrom_json(kwargs['phenogenon_path'], chrom)
            except IOError:
                # no phenogenon file for this chromosome
                continue
            # get patient_maps and patients_variants
            pm = _load_chrom_json(kwargs['patient_maps_path'], chrom)
            pv = _load_chrom_json(kwargs['patients_variants_path'], chrom)

            for gene_id in gene_ids:
                # find patient_maps and patients_variants
                kwargs['patient_map'] = pm[gene_id]
                kwargs['patients_variants'] = pv[gene_id]
                # get phenogenon
                kwargs['data'] = data[gene_id]
                print(data[gene_id]['symbol'])
                result[gene_id] = get_phenogenon(**kwargs)
        print(result)
        return

    chrom = kwargs['chrom']
    data = _load_chrom_json(kwargs['phenogenon_path'], chrom)
    # get patient_maps and patients_variants
    pm = _load_chrom_json(kwargs['patient_maps_path'], chrom)
    pv = _load_chrom_json(kwargs['patients_variants_path'], chrom)

    # the context manager closes the file even if a gene fails
    with open(kwargs['output'], 'w') as outf:
        outf.write('{')
        for n, (gene_id, value) in enumerate(data.items()):
            # find patient_maps and patients_variants
            kwargs['patient_map'] = pm[gene_id]
            kwargs['patients_variants'] = pv[gene_id]
            print(value['symbol'])
            kwargs['data'] = value
            # strip the braces so the records concatenate into one object
            output = json.dumps({gene_id: get_phenogenon(**kwargs)})[1:-1]
            if n > 0:
                # meaning not the first record. add a comma
                output = ',' + output
            outf.write(output)
        outf.write('}')
Esempio n. 6
0
def main(**kwargs):
    '''
    Build the recessive/dominant patient maps of a set of genes and dump
    them to a JSON file.

    keyword arguments checked through helper.check_args:
     remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
     patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
     p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min,
     genon_sum_cutoff_coefficient, cis_gap, output
    also read:
     genes: optional gene symbols, converted to gene ids to filter the
       MongoDB gene query
     chrom: optional; when given, the genes come from get_chrom_genes for
       that chromosome instead of the MongoDB query
     gene_inheritance_mode: optional {gene name or gene id: modes},
       default modes are 'rd'
    vcf_file is a template formatted with the chromosome of each gene.

    For every gene the variants are read from the vcf, optionally reduced
    to coding variants (remove_nc), cleaned of batch artefacts, annotated
    with cadd and passed through remove_cis; helper.get_patient_map is then
    called once per inheritance mode. Genes without variants are skipped.

    The output file maps gene_id to {'symbol', 'patient_map', 'NP'} where
    patient_map[mode] is keyed by "i,j" strings and NP[mode] is the number
    of distinct patients found in the map cells whose second index is 0.

    returns None
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',  # not using this since no phasing is done
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'genon_sum_cutoff_coefficient',
        'cis_gap',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    # NOTE(review): phs is not read anywhere else in this function
    phs = helper.get_phs(patient_info)

    # add cohort info into patient_mini
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # $in needs a real list: a dict keys view cannot be BSON-encoded
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini)}},
        {'external_id': 1, 'contact': 1},
    )
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact
        }

    # get gnomad and cadd steps
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # get genes, if not provided. get all gene_ids from mongodb,
    # if provided, convert to gene_id
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    this = {}
    if kwargs.get('genes', None) is not None:
        genes = phenopolis_utils.symbols_to_ids(kwargs['genes'],
                                                MONGO['phenopolis_db'])
        this = {'gene_id': {'$in': genes}}

    # sometimes the cursor times out.
    # but do remember to close it
    if kwargs.get('chrom', None) is not None:
        gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                      MONGO['phenopolis_db'])
    else:
        gene_ranges = MONGO['phenopolis_db'].genes.find(this,
                                                        fields,
                                                        no_cursor_timeout=True)

    # for each gene, get all valid variants/patients according to p/v_cutoff,
    # annotate using gnomad
    result = {}
    number_processed = 0
    last_chrom = None
    coding_variants = None
    try:
        for gene_range in gene_ranges:
            # print progress
            number_processed += 1
            if not number_processed % 100:
                print('===processed {} genes==='.format(number_processed))
            print('processing {}'.format(gene_range['gene_name']))
            # first parse vcf file to get genotype and coverage for each
            # variant
            vcf_file = kwargs['vcf_file'].format(gene_range['chrom'])
            args = dict(
                vcf_file=vcf_file,
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                unrelated_file=kwargs['unrelated_file'],
                human_fasta_ref=kwargs['human_fasta_ref'],
                v_cutoff=kwargs['v_cutoff'],
                p_cutoff=kwargs['p_cutoff'],
                gnomad_path=kwargs['gnomad_path'],
                gnomad_cutoff=kwargs['gnomad_cutoff'],
                patient_mini=patient_mini,
            )
            vcf_dfs = get_vcf_df(**args)
            if vcf_dfs is None:
                # no variants, continue
                continue

            # get coding variants if last_chrom != this_chrom
            if kwargs['remove_nc'] and gene_range['chrom'] != last_chrom:
                coding_variant_file = '/cluster/project8/vyp/JingYu/git/phenopolis_analysis/data/public/vcf/chr{}.coding.tsv'.format(
                    gene_range['chrom'])
                coding_variants = get_coding_variants(coding_variant_file)
            last_chrom = gene_range['chrom']
            genotype_df, cover_df, gnomad_freqs = vcf_dfs
            # then get patients_variants, with variants annotated with
            #  gnomad freqs and cadd
            args = dict(
                gnomad_freqs=gnomad_freqs,
                genotype_df=genotype_df,
            )
            patients_variants = get_patients_variants(**args)
            # remove noncoding?
            if kwargs['remove_nc']:
                print(len(patients_variants['variants']))
                helper.remove_noncoding(gene_range['gene_id'],
                                        patients_variants, coding_variants)
                print(len(patients_variants['variants']))
            # if no variants left, skip
            if not patients_variants['variants']:
                continue
            # for each gene, remove batch-specific variants
            args = dict(
                data=patients_variants,
                patient_mini=patient_mini,
            )
            batch_specific_variants = helper.get_batch_artefacts(**args)
            patients_variants = helper.remove_batch_artefacts(
                patients_variants,
                batch_specific_variants,
                patient_mini,
            )
            # add cadd
            args = dict(
                variants=patients_variants['variants'],
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                cadd_file=kwargs['cadd_file'],
            )
            cadds = helper.add_cadd(**args)
            for k, v in cadds.items():
                patients_variants['variants'][k]['cadd'] = v

            # when two variants are in cis and both appear in one patient,
            # discard the variant with lower cadd in that patient
            # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
            # for now, only focus on variants with hom_f < 0.00025
            remove_cis(patients_variants, genotype_df)

            # get patient_map for recessive and dominant modes
            args = dict(
                data=patients_variants,
                vcf=cover_df,
                gnomad_steps=gnomad_steps,
                cadd_steps=cadd_steps,
                cis_gap=kwargs['cis_gap'],
            )
            patient_map = {'r': {}, 'd': {}}
            # first get mode if provided. Note that the keys could be id
            #  or gene name
            modes = kwargs['gene_inheritance_mode'].get(
                gene_range['gene_name'],
                kwargs['gene_inheritance_mode'].get(gene_range['gene_id'],
                                                    'rd'))

            # get number of patients who carry rare variants when get
            # patient_maps
            NP = {}
            for mode in modes:
                args['mode'] = mode
                M = helper.get_patient_map(**args)
                # distinct patients over the cells whose second index is 0
                NP[mode] = len(
                    set(
                        itertools.chain.from_iterable(
                            v[0] for k, v in M.items() if k[1] == 0)))
                # change the keys to a string so they survive JSON
                for k, v in M.items():
                    patient_map[mode]['{},{}'.format(
                        k[0], k[1])] = [list(v[0]), list(v[1])]

            result[gene_range['gene_id']] = {
                'symbol': gene_range['gene_name'],
                'patient_map': patient_map,
                'NP': NP,
            }
    finally:
        # close cursor, also when a gene fails
        gene_ranges.close()

    # write everything to output
    # text mode: json.dump emits str, which a binary-mode file rejects on
    # Python 3
    with open(kwargs['output'], 'w') as outf:
        json.dump(result, outf)
Esempio n. 7
0
def main(**kwargs):
    '''
    Collect the annotated patients_variants of one genomic range.

    keyword arguments checked through helper.check_args:
     remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
     patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
     p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min, cis_gap
    also read:
     range: region written as 'chrom:start-stop'
    vcf_file is a template formatted with the chromosome of the range.

    The variants are read from the vcf, optionally reduced by
    helper.remove_noncoding (remove_nc), cleaned of batch artefacts,
    annotated with cadd and passed through remove_cis.

    returns None when the range has no variants (or none are left after
    the non-coding filter), otherwise the patients_variants dict with the
    coverage table added under 'cover_df'.
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',  # not using this since no phasing is done
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    # NOTE(review): phs is not read anywhere else in this function
    phs = helper.get_phs(patient_info)

    # split 'chrom:start-stop'
    # NOTE(review): start and stop stay strings here - confirm that
    # get_vcf_df and helper.add_cadd accept them in that form
    chrom, crange = kwargs['range'].split(':')
    start, stop = crange.split('-')
    # first parse vcf file to get genotype and coverage for each variant
    vcf_file = kwargs['vcf_file'].format(chrom)
    args = dict(
        vcf_file=vcf_file,
        chrom=chrom,
        start=start,
        stop=stop,
        unrelated_file=kwargs['unrelated_file'],
        human_fasta_ref=kwargs['human_fasta_ref'],
        v_cutoff=kwargs['v_cutoff'],
        p_cutoff=kwargs['p_cutoff'],
        gnomad_path=kwargs['gnomad_path'],
        gnomad_cutoff=kwargs['gnomad_cutoff'],
        patient_mini=patient_mini,
    )
    vcf_dfs = get_vcf_df(**args)
    if vcf_dfs is None:
        # no variants in this range
        return None

    genotype_df, cover_df, gnomad_freqs = vcf_dfs
    # then get patients_variants, with variants annotated with
    #  gnomad freqs and cadd
    args = dict(
        gnomad_freqs=gnomad_freqs,
        genotype_df=genotype_df,
    )
    patients_variants = get_patients_variants(**args)
    # remove noncoding?
    if kwargs['remove_nc']:
        helper.remove_noncoding(patients_variants, kwargs)
    # if no variants left, stop here
    if not patients_variants['variants']:
        return None
    # remove batch-specific variants
    args = dict(
        data=patients_variants,
        patient_mini=patient_mini,
    )
    batch_specific_variants = helper.get_batch_artefacts(**args)
    patients_variants = helper.remove_batch_artefacts(
        patients_variants,
        batch_specific_variants,
        patient_mini,
    )
    # add cadd
    args = dict(
        variants=patients_variants['variants'],
        chrom=chrom,
        start=start,
        stop=stop,
        cadd_file=kwargs['cadd_file'],
    )
    cadds = helper.add_cadd(**args)
    for k, v in cadds.items():
        patients_variants['variants'][k]['cadd'] = v

    # when two variants are in cis and both appear in one patient,
    # discard the variant with lower cadd in that patient
    # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
    # for now, only focus on variants with hom_f < 0.00025
    remove_cis(patients_variants, genotype_df)
    patients_variants['cover_df'] = cover_df

    # output patients_variants
    return patients_variants