def signup():
    """Create a new user from the current request arguments.

    Runs ``check_args`` over ``g.args`` for 'name' and 'password'
    (presumably a presence check -- confirm against its definition), then
    passes both values plus the full argument mapping to ``user.new_user``.

    Returns:
        dict with a confirmation 'message' and the created 'user'.
    """
    check_args(g.args, 'name', 'password')
    request_args = g.args
    created = user.new_user(
        request_args['name'],
        request_args['password'],
        request_args,
    )
    return {'message': 'signup successful', 'user': created}
def user_login():
    """Exchange a username and password for an authentication token.

    No permission is required for this endpoint: it uses an alternative
    login method (username & password rather than token) and sets
    ``g.user`` on its own.

    Returns:
        dict with a 'message' and the authenticated user's 'token'.

    Raises:
        ex.Unauthorized: when ``user.auth`` returns a falsy value.
    """
    # CONSIDER: add a delay for password based login to prevent excessive
    # attempts
    # NOTE(review): check_args is called here without g.args as the first
    # argument, unlike signup() -- confirm which signature is intended.
    check_args('username', 'password')
    credentials = g.args
    g.user = user.auth(
        credentials['username'],
        credentials['password'],
        ip=request.remote_addr,
    )
    if g.user:
        return {
            'message': 'login successful',
            'token': g.user.token,
        }
    raise ex.Unauthorized('Bad username or password.')
def vote(update, context):
    """Handle the /vote bot command.

    Deletes the message that triggered the command, then, if ``check_args``
    accepts the arguments (EVENT, LOCATION), posts the text produced by
    ``gen_message.gen_vote`` to the same chat. Nothing is sent when that
    text is empty.
    """
    logging.info('vote command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)

    accepted = check_args(update, context, "/vote", ["EVENT", "LOCATION"],
                          operator.lt, len(context.args))
    if not accepted:
        return

    sender = update.message.from_user
    text = gen_message.gen_vote(sender.first_name, context.args)
    if text:
        context.bot.send_message(chat_id=chat_id, text=text)
def get_patients_variants(**kwargs):
    '''
    Convert genotype_df into a dictionary so gnomad and cadd can be attached.

    kwargs (both handed to helper.check_args as compulsory):
        genotype_df: frame indexed by variant, one column per patient;
            a value of 1 is treated as het and 2 as hom
        gnomad_freqs: mapping variant -> dict of gnomad fields

    returns
    {
        patients: {patient: [v1, v2, ...]}   (a hom variant is listed twice)
        variants: {v1: {
            gnomad_af:, gnomad_an:, gnomad_ac:,
            gnomad_hom_c:, gnomad_hom_f:,
            cadd: None    (placeholder, filled in by the caller)
        }}
    }
    '''
    required = {
        'genotype_df',
        'gnomad_freqs',
    }
    helper.check_args(required, kwargs, 'get_patients_variants')

    genotypes = kwargs['genotype_df']
    freqs = kwargs['gnomad_freqs']
    gnomad_fields = (
        'gnomad_af',
        'gnomad_an',
        'gnomad_ac',
        'gnomad_hom_c',
        'gnomad_hom_f',
    )

    patients = defaultdict(list)
    for patient in list(genotypes):
        calls = genotypes[patient]
        carried = patients[patient]
        # het variants are recorded once ...
        carried.extend(calls[calls == 1].index)
        # ... and hom variants twice
        carried.extend(list(calls[calls == 2].index) * 2)

    variants = {}
    for variant in genotypes.index:
        annotation = {}
        for field in gnomad_fields:
            annotation[field] = freqs[variant][field]
        annotation['cadd'] = None
        variants[variant] = annotation

    return {'patients': patients, 'variants': variants}
def training(update, context):
    """Handle the /training bot command.

    Deletes the triggering message. When ``check_args`` accepts the
    arguments ([LOCATION]), builds the announcement through
    ``gen_message.gen_training`` and posts it with a "Join" inline button
    whose callback data is "train" followed by the sender's first name.
    Nothing is sent when the generated text is empty.
    """
    logging.info('training command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)

    accepted = check_args(update, context, "/training", ["[LOCATION]"],
                          operator.lt, len(context.args))
    if not accepted:
        return

    sender = update.message.from_user
    text = gen_message.gen_training(sender.first_name, context.args)
    join_button = InlineKeyboardButton(
        "Join", callback_data="train" + str(sender.first_name))
    markup = InlineKeyboardMarkup([[join_button]])
    if text:
        context.bot.send_message(chat_id=chat_id, text=text,
                                 reply_markup=markup)
def notraining(update, context):
    """Handle the /notraining bot command.

    Deletes the triggering message, then picks the reply text:
    with no arguments it comes from ``gen_message.gen_notraining_0``;
    otherwise, if ``check_args`` accepts the arguments (TIME, LOCATION),
    from ``gen_message.gen_notraining``. A non-empty text is posted to the
    chat.
    """
    logging.info('notraining command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)

    sender = update.message.from_user
    argc = len(context.args)
    text = ""
    if argc == 0:
        text = gen_message.gen_notraining_0(sender.first_name)
    elif check_args(update, context, "/notraining", ["TIME", "LOCATION"],
                    operator.lt, argc):
        text = gen_message.gen_notraining(sender.first_name, context.args)

    if text:
        context.bot.send_message(chat_id=chat_id, text=text)
def rm_vote(update, context):
    """Handle the /removevote bot command.

    Deletes the triggering message, then picks the reply text:
    with no arguments it comes from ``gen_message.gen_rm_vote_0``;
    otherwise, if ``check_args`` accepts the arguments (EVENT), from
    ``gen_message.gen_rm_vote``. A non-empty text is posted to the chat.
    """
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)
    logging.info('removevote command called')

    sender = update.message.from_user
    argc = len(context.args)
    text = ""
    if argc == 0:
        text = gen_message.gen_rm_vote_0(sender.first_name)
    elif check_args(update, context, "/removevote", ["EVENT"],
                    operator.lt, argc):
        text = gen_message.gen_rm_vote(sender.first_name, context.args)

    if text:
        context.bot.send_message(chat_id=chat_id, text=text)
def notomorrowtraining(update, context):
    """Handle the /notomorrowtraining bot command.

    Same flow as the no-training command, but both message generators are
    given a ``day`` key built from tomorrow's date plus ".train" and a
    "tomorrow" day string. A non-empty text is posted to the chat.
    """
    logging.info('notomorrowtraining command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)

    sender = update.message.from_user
    argc = len(context.args)
    text = ""
    if argc == 0:
        tomorrow_key = f"{date.today() + timedelta(days=1)}.train"
        # NOTE(review): the day string is " tomorrow" here but "tomorrow "
        # in the branch below -- presumably matching each template; confirm.
        text = gen_message.gen_notraining_0(sender.first_name,
                                            day=tomorrow_key,
                                            daystring=" tomorrow")
    elif check_args(update, context, "/notomorrowtraining",
                    ["TIME", "LOCATION"], operator.lt, argc):
        tomorrow_key = f"{date.today() + timedelta(days=1)}.train"
        text = gen_message.gen_notraining(sender.first_name, context.args,
                                          day=tomorrow_key,
                                          daystring="tomorrow ")

    if text:
        context.bot.send_message(chat_id=chat_id, text=text)
def tomorrowtraining(update, context):
    """Handle the /tomorrowtraining bot command.

    Deletes the triggering message. When ``check_args`` accepts the
    arguments (TIME, LOCATION), builds the announcement for tomorrow via
    ``gen_message.gen_training`` and posts it. The "Join" inline button is
    attached unless the generated text is the time-format error message.
    Nothing is sent when the generated text is empty.
    """
    logging.info('tomorrowtraining command called')
    chat_id = update.effective_chat.id
    context.bot.deleteMessage(chat_id=chat_id,
                              message_id=update.message.message_id)

    accepted = check_args(update, context, "/tomorrowtraining",
                          ["TIME", "LOCATION"], operator.lt,
                          len(context.args))
    if not accepted:
        return

    sender = update.message.from_user
    tomorrow_key = f"{date.today() + timedelta(days=1)}.train"
    text = gen_message.gen_training(sender.first_name, context.args,
                                    day=tomorrow_key, daystring="tomorrow ")

    # no button on the "bad time format" reply
    markup = None
    if text != "Please specify time in the format HH:MM":
        join_button = InlineKeyboardButton(
            "Join", callback_data="train" + str(sender.first_name))
        markup = InlineKeyboardMarkup([[join_button]])

    if text:
        context.bot.send_message(chat_id=chat_id, text=text,
                                 reply_markup=markup)
def main(**kwargs):
    '''
    Compute phenogenon arrays for every testable HPO term, for both the
    recessive ('r') and dominant ('d') modes.

    kwargs handed to helper.check_args as compulsory:
        N, gnomad_path, patient_mini_file, patient_info_file,
        unrelated_file, gnomad_cutoff, gnomad_step, hpo_mask,
        cadd_step, cadd_min
    All kwargs are forwarded unchanged to PM.main.

    An HPO term is tested when its value in helper.get_phs(patient_info)
    is >= N and it is not in hpo_mask.

    returns None when PM.main returns None, otherwise
    {
        phenogenon: {mode: {hpo: helper.phenogenon(...).tolist()}},
        NP, patient_map, patients_variants: copied from PM.main's result
    }
    '''
    required = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
    }
    helper.check_args(required, kwargs, 'main')

    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})

    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    # p_h for all hpos
    phs = helper.get_phs(patient_info)

    patient_map = PM.main(**kwargs)
    if patient_map is None:
        return None

    modes = 'rd'

    # patient_map keys arrive as 'a,b' strings; turn them back into
    # integer tuples
    pm = {}
    for m in modes:
        pm[m] = {
            tuple([int(part) for part in key.split(',')]): value
            for key, value in patient_map['patient_map'][m].items()
        }

    # all testable hpos
    hpos = []
    for term, count in phs.items():
        if count >= kwargs['N'] and term not in kwargs['hpo_mask']:
            hpos.append(term)

    phenogenon_cache = {'r': {}, 'd': {}}
    for hpo in hpos:
        for mode in modes:
            genon = helper.phenogenon(
                hpos=hpo,
                mode=mode,
                patient_info=patient_info,
                patient_map=pm[mode],
            )
            phenogenon_cache[mode][hpo] = genon.tolist()

    return {
        'phenogenon': phenogenon_cache,
        'NP': patient_map['NP'],
        'patient_map': patient_map['patient_map'],
        'patients_variants': patient_map['patients_variants'],
    }
def main(**kwargs):
    '''
    Build the patient_map for the recessive ('r') and dominant ('d') modes
    from the patients_variants structure produced by PV.main.

    kwargs handed to helper.check_args as compulsory:
        remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
        patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
        p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min, cis_gap
    All kwargs are forwarded unchanged to PV.main.

    returns None when PV.main returns None (nothing to map), otherwise
    {
        patient_map: {mode: {'i,j': [list, list]}},
        patients_variants: the PV.main result,
        NP: {mode: number of distinct entries in the first list of every
             cell whose second key component is 0},
    }
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',
        # not using this since no phasing is done
        # NOTE(review): unclear which key the remark above refers to
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})

    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    # patients_variants has variants annotated with gnomad freqs and cadd
    patients_variants = PV.main(**kwargs)
    if patients_variants is None:
        # PV.main found no usable variants; subscripting None below would
        # raise TypeError, so hand the None on to the caller instead
        return None

    # get patient_map for recessive and dominant modes
    args = dict(
        data=patients_variants,
        vcf=patients_variants['cover_df'],
        gnomad_steps=gnomad_steps,
        cadd_steps=cadd_steps,
        cis_gap=kwargs['cis_gap'],
    )
    patient_map = {'r': {}, 'd': {}}
    modes = 'rd'
    # get number of patients who carry rare variants when get patient_maps
    NP = {}
    for mode in modes:
        args['mode'] = mode
        M = helper.get_patient_map(**args)
        NP[mode] = len(
            set(
                itertools.chain.from_iterable(
                    v[0] for k, v in M.items() if k[1] == 0)))
        # change the tuple keys to 'i,j' strings
        for k, v in M.items():
            patient_map[mode]['{},{}'.format(k[0], k[1])] = [
                list(v[0]), list(v[1])
            ]

    return {
        'patient_map': patient_map,
        'patients_variants': patients_variants,
        'NP': NP,
    }
def main(**kwargs):
    '''
    Compute phenogenon arrays for every gene on one chromosome that has a
    precomputed patient_map, and stream them into a JSON file.

    kwargs handed to helper.check_args as compulsory:
        N, gnomad_path, patient_mini_file, patient_info_file,
        unrelated_file, gnomad_cutoff, gnomad_step, hpo_mask,
        cadd_step, cadd_min, output
    kwargs read directly (KeyError if missing):
        chrom, patient_maps_path
    optional kwargs:
        hpos_to_test: if given and non-empty, test these terms (minus
            hpo_mask) instead of all terms with p_h >= N
        gene_inheritance_mode: {gene_name or gene_id: modes}, default 'rd'

    Writes kwargs['output'] as one JSON object
        {gene_id: {symbol:, phenogenon: {mode: {hpo: list}}, NP:}}
    If that file already exists, prints 'already done' and does nothing.

    returns None
    '''
    # check args
    compulsory_keys = {
        'N',
        'gnomad_path',
        'patient_mini_file',
        'patient_info_file',
        'unrelated_file',
        'gnomad_cutoff',
        'gnomad_step',
        'hpo_mask',
        'cadd_step',
        'cadd_min',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # output already exist?
    # NOTE(review): a run that dies midway leaves a partial output file
    # behind, which this check will then treat as finished.
    if os.path.isfile(kwargs['output']):
        print('already done')
        return None

    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    phs = helper.get_phs(patient_info)

    # add cohort info into patient_mini
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # list(): a dict keys view cannot be BSON-encoded on Python 3
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini.keys())}},
        {'external_id': 1, 'contact': 1},
    )
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact,
        }

    # get hpodb from json
    # NOTE(review): hpo_db (and the augmented patient_mini above) are not
    # used further down; kept because the loaders may have side effects.
    hpo_db = get_hpo_from_json('../tests/data/new-hpo-hpo.json')

    # all testable hpos; does not depend on the gene, so computed once
    if kwargs.get('hpos_to_test'):
        hpos_to_test = [
            i for i in kwargs['hpos_to_test'] if i not in kwargs['hpo_mask']
        ]
    else:
        hpos_to_test = [
            i for i, v in phs.items()
            if v >= kwargs['N'] and i not in kwargs['hpo_mask']
        ]

    # genes on the requested chromosome
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                  MONGO['phenopolis_db'])
    try:
        # get patient_maps
        patient_maps_file = os.path.join(kwargs['patient_maps_path'],
                                         '{}.json'.format(kwargs['chrom']))
        with open(patient_maps_file, 'r') as inf:
            patient_maps = json.load(inf)

        number_processed = 0
        with open(kwargs['output'], 'w') as outf:
            outf.write('{')
            for gene_range in gene_ranges:
                # get patient_map
                patient_map = patient_maps.pop(gene_range['gene_id'], None)
                if patient_map is None:
                    continue
                # print progress
                number_processed += 1
                if not number_processed % 100:
                    print('===processed {} genes==='.format(
                        number_processed))
                print('processing {}'.format(gene_range['gene_name']))

                # mode may be keyed by gene name or by gene id
                modes = kwargs['gene_inheritance_mode'].get(
                    gene_range['gene_name'],
                    kwargs['gene_inheritance_mode'].get(
                        gene_range['gene_id'], 'rd'))

                # translate patient_map's 'i,j' keys back to int tuples
                pm = {}
                for m in modes:
                    pm[m] = {}
                    for k, v in patient_map['patient_map'][m].items():
                        key = tuple([int(i) for i in k.split(',')])
                        pm[m][key] = v

                phenogenon_cache = {'r': {}, 'd': {}}
                for hpos in hpos_to_test:
                    for mode in modes:
                        genon = helper.phenogenon(
                            hpos=hpos,
                            mode=mode,
                            patient_info=patient_info,
                            patient_map=pm[mode],
                        )
                        phenogenon_cache[mode][hpos] = genon.tolist()

                output = json.dumps({
                    gene_range['gene_id']: {
                        'symbol': gene_range['gene_name'],
                        'phenogenon': phenogenon_cache,
                        'NP': patient_map['NP'],
                    }
                })
                # strip off the braces so records can be concatenated
                # inside the single enclosing object
                output = output[1:-1]
                if number_processed != 1:
                    # meaning not the first record. add a comma
                    output = ',' + output
                outf.write(output)
            outf.write('}')
    finally:
        # close cursor even when something above fails
        gene_ranges.close()
def main(**kwargs):
    '''
    For every selected gene, build the patient_map for its inheritance
    modes and write all results to a JSON file.

    kwargs handed to helper.check_args as compulsory:
        remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
        patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
        p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min,
        genon_sum_cutoff_coefficient, cis_gap, output
    optional kwargs:
        genes: gene symbols, converted with phenopolis_utils.symbols_to_ids
        chrom: if given, genes come from get_chrom_genes for that
            chromosome; otherwise from a mongo query on the genes collection
        gene_inheritance_mode: {gene_name or gene_id: modes}, default 'rd'

    Writes kwargs['output'] as
        {gene_id: {symbol:, patient_map: {mode: {'i,j': [list, list]}}, NP:}}

    returns None
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',
        # not using this since no phasing is done
        # NOTE(review): unclear which key the remark above refers to
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'genon_sum_cutoff_coefficient',
        'cis_gap',
        'output',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    # NOTE(review): phs is not used below; kept in case the helper call
    # has side effects.
    phs = helper.get_phs(patient_info)

    # add cohort info into patient_mini
    # !!!! this belongs to UCLex's problem!!! remove if publish
    # JingYu and BLACK to UKIRDC, KELSELL to DavidKelsell
    contactdict = dict(
        JingYu='UKIRDC',
        Black='UKIRDC',
        KELSELL='DavidKelsell',
        TonySegal='SEGAL',
        SanjaySisodiya='SISODIYA',
    )
    # list(): a dict keys view cannot be BSON-encoded on Python 3
    all_p = MONGO['patient_db'].patients.find(
        {'external_id': {'$in': list(patient_mini.keys())}},
        {'external_id': 1, 'contact': 1},
    )
    for i in all_p:
        contact = i['contact']['user_id']
        contact = contactdict.get(contact, contact)
        patient_mini[i['external_id']] = {
            'hpo': patient_mini[i['external_id']],
            'contact': contact,
        }

    # get genes. if not provided, get all gene_ids from mongodb;
    # if provided, convert to gene_id
    fields = {
        'gene_id': 1,
        'gene_name': 1,
        '_id': 0,
        'chrom': 1,
        'start': 1,
        'stop': 1,
        'xstart': 1,
        'xstop': 1,
    }
    this = {}
    if kwargs.get('genes', None) is not None:
        genes = phenopolis_utils.symbols_to_ids(kwargs['genes'],
                                                MONGO['phenopolis_db'])
        this = {'gene_id': {'$in': genes}}
    # sometimes the cursor times out.
    # but do remember to close it
    if kwargs.get('chrom', None) is not None:
        gene_ranges = get_chrom_genes(kwargs['chrom'], fields,
                                      MONGO['phenopolis_db'])
    else:
        gene_ranges = MONGO['phenopolis_db'].genes.find(
            this, fields, no_cursor_timeout=True)

    # get gnomad and cadd steps
    gnomad_steps = np.arange(0,
                             kwargs['gnomad_cutoff'] + kwargs['gnomad_step'],
                             kwargs['gnomad_step'])
    cadd_steps = np.arange(kwargs['cadd_min'], 60, kwargs['cadd_step'])

    coding_variant_template = '/cluster/project8/vyp/JingYu/git/phenopolis_analysis/data/public/vcf/chr{}.coding.tsv'

    result = {}
    number_processed = 0
    last_chrom = None
    coding_variants = None
    try:
        for gene_range in gene_ranges:
            # print progress
            number_processed += 1
            if not number_processed % 100:
                print('===processed {} genes==='.format(number_processed))
            print('processing {}'.format(gene_range['gene_name']))

            # first parse vcf file to get genotype and coverage for each
            # variant
            vcf_file = kwargs['vcf_file'].format(gene_range['chrom'])
            args = dict(
                vcf_file=vcf_file,
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                unrelated_file=kwargs['unrelated_file'],
                human_fasta_ref=kwargs['human_fasta_ref'],
                v_cutoff=kwargs['v_cutoff'],
                p_cutoff=kwargs['p_cutoff'],
                gnomad_path=kwargs['gnomad_path'],
                gnomad_cutoff=kwargs['gnomad_cutoff'],
                patient_mini=patient_mini,
            )
            vcf_dfs = get_vcf_df(**args)
            if vcf_dfs is None:
                # no variants, continue
                continue

            # reload coding variants only when the chromosome changes
            if kwargs['remove_nc'] and gene_range['chrom'] != last_chrom:
                coding_variant_file = coding_variant_template.format(
                    gene_range['chrom'])
                coding_variants = get_coding_variants(coding_variant_file)
                last_chrom = gene_range['chrom']

            genotype_df, cover_df, gnomad_freqs = vcf_dfs
            # then get patients_variants, with variants annotated with
            # gnomad freqs and cadd
            args = dict(
                gnomad_freqs=gnomad_freqs,
                genotype_df=genotype_df,
            )
            patients_variants = get_patients_variants(**args)

            # remove noncoding?
            if kwargs['remove_nc']:
                print(len(patients_variants['variants']))
                helper.remove_noncoding(gene_range['gene_id'],
                                        patients_variants, coding_variants)
                print(len(patients_variants['variants']))
            # if no variants left, skip
            if not patients_variants['variants']:
                continue

            # for each gene, remove batch-specific variants
            args = dict(
                data=patients_variants,
                patient_mini=patient_mini,
            )
            batch_specific_variants = helper.get_batch_artefacts(**args)
            patients_variants = helper.remove_batch_artefacts(
                patients_variants,
                batch_specific_variants,
                patient_mini,
            )

            # add cadd
            args = dict(
                variants=patients_variants['variants'],
                chrom=gene_range['chrom'],
                start=gene_range['start'],
                stop=gene_range['stop'],
                cadd_file=kwargs['cadd_file'],
            )
            cadds = helper.add_cadd(**args)
            for k, v in cadds.items():
                patients_variants['variants'][k]['cadd'] = v

            # when two variants are in cis and both appear in one patient,
            # discard the variant with lower cadd in that patient
            # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
            # for now, only focus on variants with hom_f < 0.00025
            remove_cis(patients_variants, genotype_df)

            # get patient_map for recessive and dominant modes
            args = dict(
                data=patients_variants,
                vcf=cover_df,
                gnomad_steps=gnomad_steps,
                cadd_steps=cadd_steps,
                cis_gap=kwargs['cis_gap'],
            )
            patient_map = {'r': {}, 'd': {}}
            # first get mode if provided. Note that the keys could be id
            # or gene name
            modes = kwargs['gene_inheritance_mode'].get(
                gene_range['gene_name'],
                kwargs['gene_inheritance_mode'].get(gene_range['gene_id'],
                                                    'rd'))
            # get number of patients who carry rare variants when get
            # patient_maps
            NP = {}
            for mode in modes:
                args['mode'] = mode
                M = helper.get_patient_map(**args)
                NP[mode] = len(
                    set(
                        itertools.chain.from_iterable(
                            v[0] for k, v in M.items() if k[1] == 0)))
                # change the tuple keys to 'i,j' strings
                for k, v in M.items():
                    patient_map[mode]['{},{}'.format(k[0], k[1])] = [
                        list(v[0]), list(v[1])
                    ]

            result[gene_range['gene_id']] = {
                'symbol': gene_range['gene_name'],
                'patient_map': patient_map,
                'NP': NP,
            }
    finally:
        # close cursor even when a gene fails
        gene_ranges.close()

    # write everything to output; text mode, because json.dump writes str
    # and fails on a binary handle under Python 3
    with open(kwargs['output'], 'w') as outf:
        json.dump(result, outf)
def get_vcf_df(**kwargs):
    '''
    use tabix / bcftools to subset variants and patients for one region,
    then use p/v_cutoff and gnomad annotations to get bad_vs and bad_ps
    to remove

    kwargs handed to helper.check_args as compulsory:
        vcf_file, chrom, start, stop, unrelated_file, human_fasta_ref,
        v_cutoff, gnomad_cutoff, gnomad_path, p_cutoff, patient_mini

    returns None when read_vcf gives an empty frame, otherwise the tuple
    (genotype_df, cover_df, gnomad_freqs):
        genotype_df: rows are variants, columns are patients; bad variants
            and bad patients have been dropped
        cover_df: copy of the undropped genotype frame with values >= 0
            set to 1 and -1 set to 0
        gnomad_freqs: result of gnomad_utils.overall_freqs for the
            variants passing v_cutoff
    '''
    compulsory_keys = {
        'vcf_file',
        'chrom',
        'start',
        'stop',
        'unrelated_file',
        'human_fasta_ref',
        'v_cutoff',
        'gnomad_cutoff',
        # used unconditionally below, so fail early with the other keys
        'gnomad_path',
        'p_cutoff',
        'patient_mini',
    }
    # check args
    helper.check_args(compulsory_keys, kwargs, 'get_vcf_df')
    position = '{chrom}:{start}-{stop}'.format(**kwargs)

    # tabix | bcftools view | bcftools norm -m | bcftools norm -f
    # After handing a pipe to the next process, the parent closes its own
    # copy so an early exit downstream reaches upstream as SIGPIPE.
    ps1 = subprocess.Popen(('tabix', '-h', kwargs['vcf_file'], position),
                           stdout=subprocess.PIPE)
    # subset on unrelated samples, and normalise
    ps2 = subprocess.Popen(('bcftools', 'view', '-Ou', '-S',
                            kwargs['unrelated_file'], '-f', 'PASS'),
                           stdin=ps1.stdout,
                           stdout=subprocess.PIPE)
    ps1.stdout.close()
    ps3 = subprocess.Popen(('bcftools', 'norm', '-Ou', '-m', '-any'),
                           stdin=ps2.stdout,
                           stdout=subprocess.PIPE)
    ps2.stdout.close()
    try:
        normed_vcf = subprocess.check_output(
            ['bcftools', 'norm', '-Ov', '-f', kwargs['human_fasta_ref']],
            stdin=ps3.stdout)
    finally:
        ps3.stdout.close()
        # reap the children so they do not linger as zombies
        for proc in (ps1, ps2, ps3):
            proc.wait()

    # get vcf df. genotype -1 = missing, 0 = wildtype, 1 = het, 2 = hom
    genotype_df = read_vcf(normed_vcf)
    # empty vcf? early return
    if genotype_df.empty:
        return None

    # get poorly covered variants and individuals
    # change df to cover_df
    cover_df = genotype_df.copy()
    cover_df[cover_df >= 0] = 1
    cover_df[cover_df == -1] = 0
    pm = cover_df.mean()
    bad_ps = set(pm[pm < kwargs['p_cutoff']].index)
    # rid of patients not in patient_mini
    bad_ps.update(set(pm.index) - set(kwargs['patient_mini'].keys()))
    vm = cover_df.T.mean()
    bad_vs = set(vm[vm < kwargs['v_cutoff']].index)

    # annotate vs with gnomad
    vs = (i for i in vm.index if i not in bad_vs)
    gnomad_freqs = gnomad_utils.overall_freqs(vs, kwargs['gnomad_path'])

    # remove variants with 'SEGDUP' filter. This gives a lot of noise for
    # recessive analysis.
    # For example IGHV3-38 - ENST00000390618, 14-106866588-T-C
    bad_vs.update([
        i for i, v in gnomad_freqs.items()
        if (v['filters']['exome'] is not None
            and 'SEGDUP' in v['filters']['exome'])
        or (v['filters']['genome'] is not None
            and 'SEGDUP' in v['filters']['genome'])
    ])

    # in fact, many variants have very high af, but 0 hom_f, such as
    # 6-32548641-A-T, which has no 'SEGDUP' filter. Remove those
    # hard filtering for the time being. There might be better ways
    # (gnomad_af may be None -- see the next filter -- and None cannot be
    # compared with a float on Python 3, hence the explicit guard)
    bad_vs.update([
        i for i, v in gnomad_freqs.items()
        if v['gnomad_af'] is not None and v['gnomad_af'] > 0.01
        and v['gnomad_hom_f'] == 0.0
    ])

    # add to bad_vs gnomad_hom_af >= gnomad_cutoff,
    # and those not covered by gnomad_path
    # Note that if gnomad_hom_af >= gnomad_cutoff, then
    # gnomad_af >= gnomad_cutoff but not vice versa
    bad_vs.update([
        i for i, v in gnomad_freqs.items()
        if v['gnomad_af'] is None
        or v['gnomad_hom_f'] >= kwargs['gnomad_cutoff']
    ])

    # variants flagged by pop_filter whose summed positive genotypes
    # exceed 3
    vs_count = np.sum(genotype_df[genotype_df > 0], axis=1)
    bad_vs.update([
        i for i in gnomad_freqs
        if vs_count[i] > 3 and gnomad_freqs[i]['pop_filter']
    ])

    # then drop bad_ps and bad_vs
    genotype_df.drop(bad_vs, inplace=True)
    genotype_df.drop(bad_ps, inplace=True, axis=1)
    return (genotype_df, cover_df, gnomad_freqs)
def main(**kwargs):
    '''
    Build the patients_variants structure for one genomic range.

    kwargs handed to helper.check_args as compulsory:
        remove_nc, vcf_file, gnomad_path, cadd_file, patient_mini_file,
        patient_info_file, human_fasta_ref, unrelated_file, v_cutoff,
        p_cutoff, gnomad_cutoff, gnomad_step, cadd_step, cadd_min,
        cis_gap, range
    range is a 'chrom:start-stop' string; vcf_file is a template that is
    formatted with the chromosome.

    returns None when get_vcf_df returns None or when no variants are left
    after the optional noncoding removal, otherwise the patients_variants
    dict ({patients:, variants:}) with cadd filled in from helper.add_cadd
    and an extra 'cover_df' entry.
    '''
    # check args
    compulsory_keys = {
        'remove_nc',
        'vcf_file',
        'gnomad_path',
        'cadd_file',
        'patient_mini_file',
        'patient_info_file',
        'human_fasta_ref',
        'unrelated_file',
        'v_cutoff',
        'p_cutoff',
        # not using this since no phasing is done
        # NOTE(review): unclear which key the remark above refers to
        'gnomad_cutoff',
        'gnomad_step',
        'cadd_step',
        'cadd_min',
        'cis_gap',
        # read unconditionally below, so fail early with the other keys
        'range',
    }
    helper.check_args(compulsory_keys, kwargs, 'main')
    # defaults
    kwargs.setdefault('gene_inheritance_mode', {})
    # get patient_mini and patient_info
    patient_info = helper.get_snapshot(kwargs['patient_info_file'])
    patient_mini = helper.get_snapshot(kwargs['patient_mini_file'])
    # get p_h for all hpos
    # NOTE(review): phs is not used below; kept in case the helper call
    # has side effects.
    phs = helper.get_phs(patient_info)

    # 'chrom:start-stop' -> parts (start and stop stay strings)
    chrom, crange = kwargs['range'].split(':')
    start, stop = crange.split('-')

    # first parse vcf file to get genotype and coverage for each variant
    vcf_file = kwargs['vcf_file'].format(chrom)
    args = dict(
        vcf_file=vcf_file,
        chrom=chrom,
        start=start,
        stop=stop,
        unrelated_file=kwargs['unrelated_file'],
        human_fasta_ref=kwargs['human_fasta_ref'],
        v_cutoff=kwargs['v_cutoff'],
        p_cutoff=kwargs['p_cutoff'],
        gnomad_path=kwargs['gnomad_path'],
        gnomad_cutoff=kwargs['gnomad_cutoff'],
        patient_mini=patient_mini,
    )
    vcf_dfs = get_vcf_df(**args)
    if vcf_dfs is None:
        # no variants
        return None
    genotype_df, cover_df, gnomad_freqs = vcf_dfs

    # then get patients_variants, with variants annotated with
    # gnomad freqs and cadd
    args = dict(
        gnomad_freqs=gnomad_freqs,
        genotype_df=genotype_df,
    )
    patients_variants = get_patients_variants(**args)

    # remove noncoding?
    if kwargs['remove_nc']:
        helper.remove_noncoding(patients_variants, kwargs)
    # if no variants left, skip
    if not patients_variants['variants']:
        return None

    # remove batch-specific variants
    args = dict(
        data=patients_variants,
        patient_mini=patient_mini,
    )
    batch_specific_variants = helper.get_batch_artefacts(**args)
    patients_variants = helper.remove_batch_artefacts(
        patients_variants,
        batch_specific_variants,
        patient_mini,
    )

    # add cadd
    args = dict(
        variants=patients_variants['variants'],
        chrom=chrom,
        start=start,
        stop=stop,
        cadd_file=kwargs['cadd_file'],
    )
    cadds = helper.add_cadd(**args)
    for k, v in cadds.items():
        patients_variants['variants'][k]['cadd'] = v

    # when two variants are in cis and both appear in one patient,
    # discard the variant with lower cadd in that patient
    # example SDK1 ('7-3990565-C-T','7-4014039-C-T')
    # for now, only focus on variants with hom_f < 0.00025
    remove_cis(patients_variants, genotype_df)

    patients_variants['cover_df'] = cover_df
    # output patients_variants
    return patients_variants
def signup():
    """Register a user described by the current request arguments.

    ``check_args`` is run over ``g.args`` for 'name' and 'password'
    (presumably a presence check -- confirm against its definition); the
    two values and the whole argument mapping then go to
    ``user.new_user``.

    Returns:
        dict with a confirmation 'message' and the new 'user'.
    """
    check_args(g.args, 'name', 'password')
    form = g.args
    new_account = user.new_user(form['name'], form['password'], form)
    response = {'message': 'signup successful', 'user': new_account}
    return response