def main(**kwargs):
    """Run ``Pheno.main`` and feed its result into ``get_hgf``.

    Reads ``kwargs['hpo_json']`` and ``kwargs['patient_info_file']``; every
    other key is passed through untouched to ``Pheno.main`` and ``get_hgf``.

    Returns:
        Whatever ``get_hgf`` returns.
    """
    # Resolve the two file-backed inputs once, so both downstream calls
    # receive the loaded objects.
    kwargs.update(
        hpo_db=get_hpo_from_json(kwargs['hpo_json']),
        patient_info=helper.get_snapshot(kwargs['patient_info_file']),
    )

    phenogenon = Pheno.main(**kwargs)

    # ``Pheno.main`` bundles the patient map and the variants with its own
    # data; get_hgf expects them as separate arguments, so split them off.
    for key in ('patient_map', 'patients_variants'):
        kwargs[key] = phenogenon.pop(key)
    kwargs['data'] = phenogenon

    return get_hgf(**kwargs)
def main(**kwargs):
    """Run ``Pheno.main`` and ``get_hgf``, optionally drawing heatmaps.

    Reads ``kwargs['hpo_json']`` and ``kwargs['patient_info_file']``. When
    ``kwargs['heatmap_outdir']`` is present and not None,
    ``draw_phenogenon`` is also called with the HPO keys found in the HGF
    result for modes ``'r'`` and ``'d'``.

    Returns:
        The value returned by ``get_hgf``.
    """
    kwargs.update(
        hpo_db=get_hpo_from_json(kwargs['hpo_json']),
        patient_info=helper.get_snapshot(kwargs['patient_info_file']),
    )

    phenogenon = Pheno.main(**kwargs)

    # Split the patient map and variants off the phenogenon payload; the
    # remainder is handed over as ``data``.
    for key in ('patient_map', 'patients_variants'):
        kwargs[key] = phenogenon.pop(key)
    kwargs['data'] = phenogenon

    hgf = get_hgf(**kwargs)

    # Heatmaps are opt-in: nothing more to do without an output directory.
    if kwargs.get('heatmap_outdir') is None:
        return hgf

    scores = hgf['result']['hgf']
    kwargs['hpos'] = {mode: scores[mode].keys() for mode in ('r', 'd')}
    draw_phenogenon(**kwargs)
    return hgf
def main(**kwargs):
    """Compute phenogenon matrices for every testable HPO term.

    The patient map is obtained from ``PM.main(**kwargs)``. An HPO term is
    tested when its value in ``helper.get_phs(patient_info)`` is at least
    ``kwargs['N']`` and it is not listed in ``kwargs['hpo_mask']``. Both
    modes, ``'r'`` and ``'d'``, are evaluated for each term.

    Returns:
        None when ``PM.main`` returns None. Otherwise a dict with keys
        ``'phenogenon'`` (``{mode: {hpo: nested list}}``), ``'NP'``,
        ``'patient_map'`` and ``'patients_variants'``; the last three are
        copied from the ``PM.main`` result.
    """
    required = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
    }
    helper.check_args(required, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    hpo_values = helper.get_phs(patient_info)

    pm_result = PM.main(**kwargs)
    if pm_result is None:
        return None

    modes = 'rd'

    # The stored patient map uses "a,b" string keys; convert them to
    # integer tuples before passing them to helper.phenogenon.
    keyed_maps = {
        mode: {
            tuple(int(part) for part in key.split(',')): value
            for key, value in pm_result['patient_map'][mode].items()
        }
        for mode in modes
    }

    min_count = kwargs['N']
    masked = kwargs['hpo_mask']
    testable_hpos = [
        hpo for hpo, value in hpo_values.items()
        if value >= min_count and hpo not in masked
    ]

    phenogenon_cache = {'r': {}, 'd': {}}
    for hpo in testable_hpos:
        for mode in modes:
            genon = helper.phenogenon(
                hpos=hpo,
                mode=mode,
                patient_info=patient_info,
                patient_map=keyed_maps[mode],
            )
            # Store as nested lists (``tolist``) rather than the array object.
            phenogenon_cache[mode][hpo] = genon.tolist()

    return {
        'phenogenon': phenogenon_cache,
        'NP': pm_result['NP'],
        'patient_map': pm_result['patient_map'],
        'patients_variants': pm_result['patients_variants'],
    }
def main(**kwargs):
    """Write per-gene phenogenon matrices for one chromosome to a JSON file.

    For every gene yielded by ``get_chrom_genes`` for ``kwargs['chrom']``
    that has an entry in ``<patient_maps_path>/<chrom>.json``,
    ``helper.phenogenon`` is evaluated for each HPO term to test and each
    inheritance mode. Results are streamed into ``kwargs['output']`` as one
    JSON object keyed by gene id, so not all genes are held in memory.

    Keys checked by ``helper.check_args``: ``N``, ``gnomad_path``,
    ``patient_mini_file``, ``patient_info_file``, ``unrelated_file``,
    ``gnomad_cutoff``, ``gnomad_step``, ``hpo_mask``, ``cadd_step``,
    ``cadd_min``, ``output``.

    Also read:
        chrom, patient_maps_path: needed, but not part of the checked set.
        hpos_to_test: optional. When missing or empty, every HPO whose
            value in ``helper.get_phs`` is at least ``N`` is tested.
            ``hpo_mask`` entries are always excluded.
        gene_inheritance_mode: optional mapping from gene name or gene id
            to a mode string; defaults to ``'rd'`` per gene.

    Returns:
        None. Prints ``'already done'`` and returns early if the output
        file already exists.
    """
    compulsory_keys = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    # output already exist?
    if os.path.isfile(kwargs['output']):
        print('already done')
        return None

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    phs = helper.get_phs(patient_info)

    # Add cohort info into patient_mini.
    # NOTE: the alias table is specific to the UCLex cohort; remove it
    # before publishing (carried over from the original comment).
    contact_aliases = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # list(...) so the query gets a real list, not a dict view (Python 3).
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini)}},
        {'external_id': 1, 'contact': 1},
    )
    for patient in all_p:
        contact = patient['contact']['user_id']
        contact = contact_aliases.get(contact, contact)
        external_id = patient['external_id']
        patient_mini[external_id] = {
            'hpo': patient_mini[external_id],
            'contact': contact,
        }

    # NOTE(review): the result is not used below and the path is relative
    # to the working directory; kept because the call predates this change.
    hpo_db = get_hpo_from_json('../tests/data/new-hpo-hpo.json')

    # HPO terms to test do not depend on the gene, so compute them once.
    requested_hpos = kwargs.get('hpos_to_test')
    if requested_hpos:
        hpos_to_test = [
            i for i in requested_hpos if i not in kwargs['hpo_mask']
        ]
    else:
        hpos_to_test = [
            i for i, v in phs.items()
            if v >= kwargs['N'] and i not in kwargs['hpo_mask']
        ]

    # Load patient maps before opening the cursor so a missing file does
    # not leave the cursor open.
    patient_maps_file = os.path.join(
        kwargs['patient_maps_path'],
        '{}.json'.format(kwargs['chrom']),
    )
    with open(patient_maps_file, 'r') as inf:
        patient_maps = json.load(inf)

    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    gene_ranges = get_chrom_genes(
        kwargs['chrom'], fields, MONGO['phenopolis_db'])

    number_processed = 0
    try:
        with open(kwargs['output'], 'w') as outf:
            outf.write('{')
            for gene_range in gene_ranges:
                patient_map = patient_maps.pop(gene_range['gene_id'], None)
                if patient_map is None:
                    continue

                # print progress
                number_processed += 1
                if not number_processed % 100:
                    print('===processed {} genes==='.format(
                        number_processed))
                print('processing {}'.format(gene_range['gene_name']))

                # Mode may be keyed by gene name or gene id; default both.
                modes = kwargs['gene_inheritance_mode'].get(
                    gene_range['gene_name'],
                    kwargs['gene_inheritance_mode'].get(
                        gene_range['gene_id'], 'rd'))

                # Stored keys are "a,b" strings; convert to int tuples.
                pm = {}
                for m in modes:
                    pm[m] = {
                        tuple(int(i) for i in k.split(',')): v
                        for k, v in patient_map['patient_map'][m].items()
                    }

                phenogenon_cache = {'r': {}, 'd': {}}
                for hpos in hpos_to_test:
                    for mode in modes:
                        genon = helper.phenogenon(
                            hpos=hpos,
                            mode=mode,
                            patient_info=patient_info,
                            patient_map=pm[mode],
                        )
                        phenogenon_cache[mode][hpos] = genon.tolist()

                output = json.dumps({
                    gene_range['gene_id']: {
                        'symbol': gene_range['gene_name'],
                        'phenogenon': phenogenon_cache,
                        'NP': patient_map['NP'],
                    }
                })
                # Strip the braces so the records join into one object.
                output = output[1:-1]
                if number_processed != 1:
                    # not the first record: separate with a comma
                    output = ',' + output
                outf.write(output)
            outf.write('}')
    finally:
        # Close the cursor even when processing fails part-way.
        gene_ranges.close()
def _load_chrom_json(directory, chrom):
    """Return the parsed content of ``<directory>/<chrom>.json``."""
    with open(os.path.join(directory, '{}.json'.format(chrom)), 'r') as inf:
        return json.load(inf)


def main(**kwargs):
    """Run ``get_phenogenon`` on precomputed per-chromosome JSON files.

    Two modes:

    * ``'genes'`` in kwargs: the gene names are resolved to gene ids and
      chromosomes through ``MONGO['phenopolis_db'].genes``. Each gene is
      processed from its chromosome's files and the collected results are
      printed. Chromosomes whose phenogenon file cannot be opened are
      skipped.
      NOTE(review): this branch neither writes ``kwargs['output']`` nor
      returns the results; confirm that printing is all that is wanted.
    * otherwise: every gene in ``<phenogenon_path>/<chrom>.json`` for
      ``kwargs['chrom']`` is processed and streamed into
      ``kwargs['output']`` as one JSON object keyed by gene id.

    Also read: ``patient_info_file``, ``phenogenon_path``,
    ``patient_maps_path``, ``patients_variants_path``.

    Raises:
        ValueError: if ``kwargs['output']`` is falsy.
    """
    if not kwargs['output']:
        msg = 'Need to specify output'
        raise ValueError(msg)
    result = {}
    kwargs['patient_info'] = helper.get_snapshot(kwargs['patient_info_file'])

    if 'genes' in kwargs:
        # find gene_id and chrom
        genes = MONGO['phenopolis_db'].genes.find(
            {'gene_name': {'$in': kwargs['genes']}},
            {'_id': 0, 'gene_id': 1, 'chrom': 1},
        )
        # Group gene ids by chromosome so each file is loaded only once.
        chroms = defaultdict(list)
        for g in genes:
            chroms[g['chrom']].append(g['gene_id'])

        for chrom, gene_ids in chroms.items():
            try:
                data = _load_chrom_json(kwargs['phenogenon_path'], chrom)
            except IOError:
                # No phenogenon file for this chromosome: skip it.
                continue
            pm = _load_chrom_json(kwargs['patient_maps_path'], chrom)
            pv = _load_chrom_json(kwargs['patients_variants_path'], chrom)
            for gene_id in gene_ids:
                kwargs['patient_map'] = pm[gene_id]
                kwargs['patients_variants'] = pv[gene_id]
                kwargs['data'] = data[gene_id]
                print(data[gene_id]['symbol'])
                result[gene_id] = get_phenogenon(**kwargs)
        print(result)
    else:
        chrom = kwargs['chrom']
        data = _load_chrom_json(kwargs['phenogenon_path'], chrom)
        pm = _load_chrom_json(kwargs['patient_maps_path'], chrom)
        pv = _load_chrom_json(kwargs['patients_variants_path'], chrom)

        # Context manager so the handle is closed even if a gene fails.
        with open(kwargs['output'], 'w') as outf:
            outf.write('{')
            for n, (gene_id, value) in enumerate(data.items()):
                kwargs['patient_map'] = pm[gene_id]
                kwargs['patients_variants'] = pv[gene_id]
                print(value['symbol'])
                kwargs['data'] = value
                output = json.dumps({gene_id: get_phenogenon(**kwargs)})
                # Strip the braces so the records join into one object.
                output = output[1:-1]
                if n > 0:
                    # not the first record: separate with a comma
                    output = ',' + output
                outf.write(output)
            outf.write('}')
def main(**kwargs):
    """Build patient maps for a set of genes and dump them as JSON.

    Genes come from ``get_chrom_genes`` when ``kwargs['chrom']`` is given,
    otherwise from ``MONGO['phenopolis_db'].genes``, restricted to
    ``kwargs['genes']`` when provided. For each gene the steps are:

    1. ``get_vcf_df`` over the gene's range; the gene is skipped on None.
    2. ``get_patients_variants`` on the genotypes and gnomad frequencies.
    3. Optional non-coding removal (``remove_nc``); the gene is skipped if
       no variants remain.
    4. Batch-artefact removal, CADD annotation, ``remove_cis``.
    5. ``helper.get_patient_map`` for each inheritance mode.

    The result written to ``kwargs['output']`` maps each gene id to
    ``{'symbol', 'patient_map', 'NP'}``. ``NP[mode]`` is the number of
    distinct entries in ``v[0]`` over patient-map cells whose second key
    component is 0.

    Keys checked by ``helper.check_args``: ``remove_nc``, ``vcf_file``,
    ``gnomad_path``, ``cadd_file``, ``patient_mini_file``,
    ``patient_info_file``, ``human_fasta_ref``, ``unrelated_file``,
    ``v_cutoff``, ``p_cutoff``, ``gnomad_cutoff``, ``gnomad_step``,
    ``cadd_step``, ``cadd_min``, ``genon_sum_cutoff_coefficient``,
    ``cis_gap``, ``output``. Optional: ``genes``, ``chrom``,
    ``gene_inheritance_mode``.

    Returns:
        None.
    """
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'genon_sum_cutoff_coefficient',
        'cis_gap',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # NOTE(review): ``phs`` is not used further in this function.
    phs = helper.get_phs(patient_info)

    # Add cohort info into patient_mini.
    # NOTE: the alias table is specific to the UCLex cohort; remove it
    # before publishing (carried over from the original comment).
    contact_aliases = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # list(...) so the query gets a real list, not a dict view (Python 3).
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini)}},
        {'external_id': 1, 'contact': 1},
    )
    for patient in all_p:
        contact = patient['contact']['user_id']
        contact = contact_aliases.get(contact, contact)
        external_id = patient['external_id']
        patient_mini[external_id] = {
            'hpo': patient_mini[external_id],
            'contact': contact,
        }

    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    this = {}
    if kwargs.get('genes', None) is not None:
        genes = phenopolis_utils.symbols_to_ids(kwargs['genes'],
                                                MONGO['phenopolis_db'])
        this = {'gene_id': {'$in': genes}}

    # get gnomad and cadd steps
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # The cursor is opened without a timeout in the non-chrom case, so it
    # must be closed explicitly (see the ``finally`` below).
    if kwargs.get('chrom', None) is not None:
        gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                      MONGO['phenopolis_db'])
    else:
        gene_ranges = MONGO['phenopolis_db'].genes.find(
            this, fields, no_cursor_timeout=True)

    result = {}
    number_processed = 0
    last_chrom = None
    coding_variants = None
    try:
        for gene_range in gene_ranges:
            # print progress
            number_processed += 1
            if not number_processed % 100:
                print('===processed {} genes==='.format(number_processed))
            print('processing {}'.format(gene_range['gene_name']))

            # Parse the vcf to get genotype and coverage for each variant.
            vcf_dfs = get_vcf_df(
                vcf_file=kwargs['vcf_file'].format(gene_range['chrom']),
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                unrelated_file=kwargs['unrelated_file'],
                human_fasta_ref=kwargs['human_fasta_ref'],
                v_cutoff=kwargs['v_cutoff'],
                p_cutoff=kwargs['p_cutoff'],
                gnomad_path=kwargs['gnomad_path'],
                gnomad_cutoff=kwargs['gnomad_cutoff'],
                patient_mini=patient_mini,
            )
            if vcf_dfs is None:
                # no variants, continue
                continue

            # Reload the coding-variant table only on a chromosome change.
            # NOTE(review): the path below is hard-coded to one cluster.
            if kwargs['remove_nc'] and gene_range['chrom'] != last_chrom:
                coding_variant_file = '/cluster/project8/vyp/JingYu/git/phenopolis_analysis/data/public/vcf/chr{}.coding.tsv'.format(
                    gene_range['chrom'])
                coding_variants = get_coding_variants(coding_variant_file)
                last_chrom = gene_range['chrom']

            genotype_df, cover_df, gnomad_freqs = vcf_dfs
            patients_variants = get_patients_variants(
                gnomad_freqs=gnomad_freqs,
                genotype_df=genotype_df,
            )

            # remove noncoding?
            if kwargs['remove_nc']:
                print(len(patients_variants['variants']))
                helper.remove_noncoding(gene_range['gene_id'],
                                        patients_variants, coding_variants)
                print(len(patients_variants['variants']))
            # if no variants left, skip
            if not patients_variants['variants']:
                continue

            # for each gene, remove batch-specific variants
            batch_specific_variants = helper.get_batch_artefacts(
                data=patients_variants,
                patient_mini=patient_mini,
            )
            patients_variants = helper.remove_batch_artefacts(
                patients_variants,
                batch_specific_variants,
                patient_mini,
            )

            # add cadd
            cadds = helper.add_cadd(
                variants=patients_variants['variants'],
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                cadd_file=kwargs['cadd_file'],
            )
            for k, v in cadds.items():
                patients_variants['variants'][k]['cadd'] = v

            # when two variants are in cis and both appear in one patient,
            # discard the variant with lower cadd in that patient
            # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
            # for now, only focus on variants with hom_f < 0.00025
            remove_cis(patients_variants, genotype_df)

            map_args = dict(
                data=patients_variants,
                vcf=cover_df,
                gnomad_steps=gnomad_steps,
                cadd_steps=cadd_steps,
                cis_gap=kwargs['cis_gap'],
            )
            patient_map = {'r': {}, 'd': {}}
            # Mode may be keyed by gene name or gene id; default is both.
            modes = kwargs['gene_inheritance_mode'].get(
                gene_range['gene_name'],
                kwargs['gene_inheritance_mode'].get(
                    gene_range['gene_id'], 'rd'))

            NP = {}
            for mode in modes:
                M = helper.get_patient_map(mode=mode, **map_args)
                # Count distinct entries across cells whose second key
                # component is 0.
                NP[mode] = len(
                    set(
                        itertools.chain.from_iterable(
                            v[0] for k, v in M.items() if k[1] == 0)))
                # JSON needs string keys and lists.
                for k, v in M.items():
                    patient_map[mode]['{},{}'.format(
                        k[0], k[1])] = [list(v[0]), list(v[1])]

            result[gene_range['gene_id']] = {
                'symbol': gene_range['gene_name'],
                'patient_map': patient_map,
                'NP': NP,
            }
    finally:
        # Close the cursor even when a gene fails.
        gene_ranges.close()

    # json.dump writes str, so the file must be opened in text mode.
    with open(kwargs['output'], 'w') as outf:
        json.dump(result, outf)
def main(**kwargs):
    """Collect annotated patient variants for one genomic range.

    ``kwargs['range']`` has the form ``'<chrom>:<start>-<stop>'``. The vcf
    for that chromosome is parsed with ``get_vcf_df``. The variants are
    then optionally stripped of non-coding entries (``remove_nc``), cleaned
    of batch artefacts, annotated with CADD and passed through
    ``remove_cis``.

    Returns:
        None when ``get_vcf_df`` returns None or no variants remain.
        Otherwise the ``patients_variants`` dict, with the coverage frame
        stored under ``'cover_df'``.
    """
    required = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
    }
    helper.check_args(required, kwargs, 'main')
    kwargs.setdefault('gene_inheritance_mode', {})

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])

    # The next three values are computed as in the original but are not
    # used further in this function.
    phs = helper.get_phs(patient_info)
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # Split "<chrom>:<start>-<stop>"; start and stop stay strings.
    region = kwargs['range']
    chrom, span = region.split(':')
    start, stop = span.split('-')

    # Parse the vcf to get genotype and coverage for each variant.
    vcf_dfs = get_vcf_df(
        vcf_file=kwargs['vcf_file'].format(chrom),
        chrom=chrom,
        start=start,
        stop=stop,
        unrelated_file=kwargs['unrelated_file'],
        human_fasta_ref=kwargs['human_fasta_ref'],
        v_cutoff=kwargs['v_cutoff'],
        p_cutoff=kwargs['p_cutoff'],
        gnomad_path=kwargs['gnomad_path'],
        gnomad_cutoff=kwargs['gnomad_cutoff'],
        patient_mini=patient_mini,
    )
    if vcf_dfs is None:
        # no variants in this range
        return None
    genotype_df, cover_df, gnomad_freqs = vcf_dfs

    patients_variants = get_patients_variants(
        gnomad_freqs=gnomad_freqs,
        genotype_df=genotype_df,
    )

    if kwargs['remove_nc']:
        helper.remove_noncoding(patients_variants, kwargs)
    if not patients_variants['variants']:
        # nothing left to report
        return None

    # Drop variants flagged as batch-specific artefacts.
    artefacts = helper.get_batch_artefacts(
        data=patients_variants,
        patient_mini=patient_mini,
    )
    patients_variants = helper.remove_batch_artefacts(
        patients_variants,
        artefacts,
        patient_mini,
    )

    # Attach CADD scores to the remaining variants.
    cadd_scores = helper.add_cadd(
        variants=patients_variants['variants'],
        chrom=chrom,
        start=start,
        stop=stop,
        cadd_file=kwargs['cadd_file'],
    )
    for variant_id, score in cadd_scores.items():
        patients_variants['variants'][variant_id]['cadd'] = score

    # when two variants are in cis and both appear in one patient,
    # discard the variant with lower cadd in that patient
    # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
    # for now, only focus on variants with hom_f < 0.00025
    remove_cis(patients_variants, genotype_df)

    patients_variants['cover_df'] = cover_df
    return patients_variants