def get_pop_curse_flag(self, mode, hpo):
    '''
    Get the "cursed" populations for an HPO term, or None.

    Variants are collected from the patients mapped to every genon row whose
    first-column value is <= self.pop_check_p. A variant is kept when its
    gnomAD frequency (gnomad_hom_f for mode 'r', gnomad_af for mode 'd') is
    below self.gnomad_step and its cadd score falls inside the row's
    [cadd_step * ind, cadd_step * (ind + 1)) bin. Kept variants are split
    into 'pos' (patient has hpo) and 'neg' (patient does not).

    mode: 'r' or 'd'
    hpo:  HPO term used to index self.genons[mode] and self.patient_info[p]

    Returns a list of populations that are most frequent among 'pos'
    variants but not among 'neg' variants, or None when there are fewer than
    self.pop_flags[1] 'pos' variants or no such population exists.

    Raises ValueError if mode is neither 'r' nor 'd'.
    '''
    # Validate the mode before it is used as a key, so that an unsupported
    # mode is reported as ValueError instead of a KeyError from self.genons.
    freq_field_by_mode = {'r': 'gnomad_hom_f', 'd': 'gnomad_af'}
    if mode not in freq_field_by_mode:
        msg = 'mode has to be either r or d'
        raise ValueError(msg)
    tp = freq_field_by_mode[mode]

    genon = self.genons[mode][hpo]
    # get inds with small p
    s_p_inds = np.where(genon[:, 0] <= self.pop_check_p)[0]

    # get patients, then variants
    variants = {'pos': [], 'neg': []}
    gnomad_cut = self.gnomad_step
    for ind in s_p_inds:
        patients = self.patient_map['patient_map'][mode][
            '{},0'.format(ind)][0]
        cadd_low = self.cadd_step * ind
        cadd_high = self.cadd_step * (ind + 1)
        for p in patients:
            curse = 'pos' if hpo in self.patient_info[p] else 'neg'
            for v in self.patients_variants['patients'][p]:
                annotation = self.patients_variants['variants'][v]
                is_rare = annotation[tp] < gnomad_cut
                in_cadd_bin = cadd_low <= annotation['cadd'] < cadd_high
                if is_rare and in_cadd_bin:
                    variants[curse].append(v)

    if len(variants['pos']) < self.pop_flags[1]:
        # number of variants are too few
        return None

    # annotate variants using gnomad_utils, and find pop curse
    # if pos and neg find same most freq pop, it is not reported
    gnomad_freqs = gnomad_utils.overall_freqs(
        variants['pos'] + variants['neg'], self.gnomad_path)
    pop_curse = {'pos': set(), 'neg': set()}
    for group, group_variants in variants.items():
        counts = Counter()
        for v in group_variants:
            counts.update(gnomad_freqs[v]['most_freq_pops'])
        if not counts:
            continue
        # Ties are all kept: every population sharing the top count is
        # treated as "most frequent" for this group.
        top_count = max(counts.values())
        if top_count / len(group_variants) >= self.pop_flags[0]:
            pop_curse[group] = {
                pop for pop, count in counts.items() if count == top_count
            }
    return list(pop_curse['pos'] - pop_curse['neg']) or None
def gnomad(self):
    '''
    Return gnomAD annotations for the variants in self._v, keyed like self._v.

    The result is computed once and cached on self._gnomad. Annotations
    already stored in the 'variants' table of self.db_conn are reused;
    the remaining variants are annotated with gnomad_utils.overall_freqs
    (one call per group yielded by get_chrom_vars) and written back to the
    table. Variants missing from the annotation result map to None.

    Raises ValueError if nothing is cached and self.path_to_gnomad is falsy.
    '''
    if getattr(self, '_gnomad', None) is not None:
        return self._gnomad

    if not self.path_to_gnomad:
        raise ValueError(
            'Required to provide a path to gnomad for annotation')

    # check database
    db_c = self.db_conn.cursor()
    rows = batch_query(db_c, 'variants', list(self._v.values()))
    stored = {}
    for row in rows:
        record = dict_factory(db_c, row)
        if record['gnomad']:
            stored[record['id']] = json.loads(record['gnomad'])

    annotations = {}
    new_vars = {}
    for key, variant in self._v.items():
        # A stored JSON null decodes to None and counts as "not annotated".
        if variant in stored and stored[variant] is not None:
            annotations[key] = stored[variant]
        else:
            # not in database, keep for later query
            new_vars[key] = variant

    if new_vars:
        print('querying gnomad')
        # need to divide vars according to their chroms
        new_result = {}
        for chrom_vars in get_chrom_vars(new_vars.values()):
            new_result.update(
                gnomad_utils.overall_freqs(chrom_vars, self.path_to_gnomad))
        # update database
        update_db(self.db_conn, 'variants', ['gnomad'],
                  {k: [json.dumps(v)] for k, v in new_result.items()})
        # populate gnomad for the freshly queried variants
        for key, variant in new_vars.items():
            annotations[key] = new_result.get(variant, None)

    self._gnomad = annotations
    return self._gnomad
def main(params):
    '''
    Report the variants hitting params.gene found in the per-sample CSVs.

    Every '*.csv' file in PATH_TO_CSVS is scanned; the file name without
    '.csv' is used as the sample id and as the name of the genotype column.
    Rows whose 'HUGO.no.splice.info' lists params.gene are collected.

    If params.cadd is falsy, the variants are written to '<gene>.vcf' as a
    five-column VCF sorted by position. Otherwise params.cadd is read as a
    tab-separated file (first four columns joined with '-' give the variant
    id, last column the score) and a text report is written to
    '<gene>.txt', including gnomAD frequencies and, where the fourth
    '_'-separated field of the sample id matches a row of PATIENT_CSV, the
    patient details.
    '''
    # number of '1' alleles in the genotype call -> zygosity
    genotype_dict = {1: 'het', 2: 'hom'}

    if not params.cadd:
        # no cadd provided, write to vcf
        outfile = '{}.vcf'.format(params.gene)
    else:
        outfile = '{}.txt'.format(params.gene)

    # read patient info (only the first 11 columns are used)
    patient = {}
    patient_header = []
    with open(PATIENT_CSV, 'rt', encoding='utf-8-sig') as inf:
        for row in csv.reader(inf):
            if not row:
                # blank line: nothing to index, and row[0] would fail
                continue
            row = row[:11]
            if not patient_header:
                patient_header = row
                continue
            record = dict(zip(patient_header, row))
            del record['IRDC ID']
            patient[row[0]] = record

    variants = set()
    report = {}
    for csvfile in os.listdir(PATH_TO_CSVS):
        if not csvfile.endswith('.csv'):
            continue
        sample = csvfile.split('.csv')[0]
        header = []
        with open(os.path.join(PATH_TO_CSVS, csvfile), 'rt') as inf:
            for row in csv.reader(inf):
                if not row:
                    # blank line: would otherwise raise KeyError below
                    continue
                if not header:
                    header = row
                    continue
                record = dict(zip(header, row))
                # has gene?
                if params.gene not in record['HUGO.no.splice.info'].split(','):
                    continue
                variant = CommonFuncs.find_leftmost_synonymous_variant(
                    CommonFuncs.clean_variant(
                        record['clean.signature'].replace('_', '-')))
                variants.add(variant)
                # count the '1' characters in the GT part of the sample column
                genotype = genotype_dict.get(
                    Counter(record[sample].split(':')[0])['1'], 'unknown')
                carrier = {'id': sample, 'genotype': genotype}
                if variant not in report:
                    # the first row seen for a variant supplies its annotation
                    record['samples'] = [carrier]
                    report[variant] = record
                else:
                    report[variant]['samples'].append(carrier)

    # variant ids look like chrom-pos-ref-alt; order by numeric position
    ordered_variants = sorted(variants, key=lambda x: int(x.split('-')[1]))

    if not params.cadd:
        # write vcf
        with open(outfile, 'wt') as outf:
            # write vcf header
            outf.write('##VCF4.1\n')
            outf.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT']) + '\n')
            for variant in ordered_variants:
                chrom, pos, ref, alt = variant.split('-')
                outf.write('\t'.join([chrom, pos, '.', ref, alt]) + '\n')
        return

    # get gnomads
    gnomads = gnomad_utils.overall_freqs(list(variants), PATH_TO_GNOMAD)
    # get cadd
    cadds = {}
    with open(params.cadd, 'rt') as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            row = line.rstrip().split('\t')
            cadds['-'.join(row[:4])] = row[-1]

    # write report
    with open(outfile, 'wt') as outf:
        for variant in ordered_variants:
            entry = report[variant]
            outf.write(variant + ':\n')
            outf.write('\tFilter: {}\n'.format(entry['FILTER']))
            outf.write('\t{}\n'.format(entry['AAChange']))
            outf.write(
                '\tPolyphen: {}, SIFT: {}, MutationTaster: {}\n'.format(
                    entry['LJB_PolyPhen2_Pred'],
                    entry['LJB_SIFT_Pred'],
                    entry['LJB_MutationTaster_Pred']))
            outf.write('\tgnomad_af:{}, gnomad_hom_f:{}, cadd:{}\n'.format(
                gnomads[variant]['gnomad_af'],
                gnomads[variant]['gnomad_hom_f'],
                cadds[variant]))
            for sample in entry['samples']:
                outf.write('\t{} ({}):\n'.format(sample['id'],
                                                 sample['genotype']))
                # The study number is the 4th '_'-separated field of the
                # sample id; ids with fewer fields simply get no patient
                # details instead of aborting a half-written report.
                id_fields = sample['id'].split('_')
                study_number = id_fields[3] if len(id_fields) > 3 else None
                if study_number in patient:
                    for h in patient_header:
                        if h in patient[study_number]:
                            outf.write('\t\t{}: {}\n'.format(
                                h, patient[study_number][h]))
            outf.write('\n')
def get_vcf_df(**kwargs):
    '''
    use tabix / bcftools to subset variants and patients, then flag the
    poorly covered or too common ones (bad_vs, bad_ps) and drop them.

    Compulsory keyword arguments: vcf_file, chrom, start, stop,
    unrelated_file, human_fasta_ref, v_cutoff, gnomad_cutoff, gnomad_path,
    p_cutoff, patient_mini.

    Returns None when the region yields an empty genotype table, otherwise
    the tuple (genotype_df, cover_df, gnomad_freqs) where genotype_df has
    the bad variants (rows) and bad patients (columns) removed, cover_df is
    the 0/1 coverage table computed before dropping, and gnomad_freqs is the
    raw result of gnomad_utils.overall_freqs.
    '''
    # 'gnomad_path' is read further down, so require it up front rather than
    # failing with a KeyError after the whole pipeline has already run.
    compulsory_keys = {
        'vcf_file',
        'chrom',
        'start',
        'stop',
        'unrelated_file',
        'human_fasta_ref',
        'v_cutoff',
        'gnomad_cutoff',
        'gnomad_path',
        'p_cutoff',
        'patient_mini',
    }
    # check args
    helper.check_args(compulsory_keys, kwargs, 'get_vcf_df')

    position = '{chrom}:{start}-{stop}'.format(**kwargs)
    # region -> PASS variants of unrelated samples -> split multiallelics
    # -> left-align against the reference, emitted as text VCF
    normed_vcf = _piped_check_output([
        ['tabix', '-h', kwargs['vcf_file'], position],
        ['bcftools', 'view', '-Ou', '-S', kwargs['unrelated_file'],
         '-f', 'PASS'],
        ['bcftools', 'norm', '-Ou', '-m', '-any'],
        ['bcftools', 'norm', '-Ov', '-f', kwargs['human_fasta_ref']],
    ])

    # get vcf df. genotype -1 = missing, 0 = wildtype, 1 = het, 2 = hom
    genotype_df = read_vcf(normed_vcf)
    # empty vcf? early return
    if genotype_df.empty:
        return None

    # get poorly covered variants and individuals
    # cover_df: 1 where a genotype was called, 0 where it is missing
    cover_df = genotype_df.copy()
    cover_df[cover_df >= 0] = 1
    cover_df[cover_df == -1] = 0

    pm = cover_df.mean()
    bad_ps = set(pm[pm < kwargs['p_cutoff']].index)
    # rid of patients not in patient_mini
    bad_ps.update(set(pm.index) - set(kwargs['patient_mini'].keys()))

    vm = cover_df.T.mean()
    bad_vs = set(vm[vm < kwargs['v_cutoff']].index)

    # annotate vs with gnomad (a list, so the annotator may iterate it
    # more than once)
    vs = [i for i in vm.index if i not in bad_vs]
    gnomad_freqs = gnomad_utils.overall_freqs(vs, kwargs['gnomad_path'])

    for variant, freq in gnomad_freqs.items():
        filters = freq['filters']
        # remove variants with 'SEGDUP' filter. This gives a lot of noise
        # for recessive analysis.
        # For example IGHV3-38 - ENST00000390618, 14-106866588-T-C
        if any(filters[source] is not None and 'SEGDUP' in filters[source]
               for source in ('exome', 'genome')):
            bad_vs.add(variant)
            continue
        af = freq['gnomad_af']
        hom_f = freq['gnomad_hom_f']
        # not covered by gnomad_path. Checked before any numeric comparison
        # because None cannot be ordered against a float on Python 3.
        if af is None:
            bad_vs.add(variant)
            continue
        # in fact, many variants have very high af, but 0 hom_f, such as
        # 6-32548641-A-T, which has no 'SEGDUP' filter. Remove those.
        # hard filtering for the time being. There might be better ways
        if af > 0.01 and hom_f == 0.0:
            bad_vs.add(variant)
        # Note that if gnomad_hom_f >= gnomad_cutoff, then
        # gnomad_af >= gnomad_cutoff, but not vice versa
        elif hom_f >= kwargs['gnomad_cutoff']:
            bad_vs.add(variant)

    # per-variant sum of the non-wildtype genotype values
    vs_count = np.sum(genotype_df[genotype_df > 0], axis=1)
    bad_vs.update([
        i for i in gnomad_freqs
        if vs_count[i] > 3 and gnomad_freqs[i]['pop_filter']
    ])

    # then drop bad_ps and bad_vs
    genotype_df.drop(bad_vs, inplace=True)
    genotype_df.drop(bad_ps, inplace=True, axis=1)
    return (genotype_df, cover_df, gnomad_freqs)


def _piped_check_output(commands):
    '''
    Run commands as a pipeline (cmd1 | cmd2 | ...) and return the stdout of
    the last one. Raises subprocess.CalledProcessError if the last command
    exits non-zero; all spawned processes are reaped before returning.
    '''
    feeders = []
    upstream = None
    try:
        for command in commands[:-1]:
            proc = subprocess.Popen(command,
                                    stdin=upstream,
                                    stdout=subprocess.PIPE)
            feeders.append(proc)
            if upstream is not None:
                # Drop the parent's copy of the pipe so the producer gets
                # SIGPIPE if its consumer exits early.
                upstream.close()
            upstream = proc.stdout
        return subprocess.check_output(commands[-1], stdin=upstream)
    finally:
        for proc in feeders:
            proc.stdout.close()
        for proc in feeders:
            proc.wait()
def pop_annotate(line_cache, variant_cache, header, fields, outf, options):
    '''
    Add population frequencies to cached VCF lines and write them to outf.

    For every source configured in options['pop_freqs'] ('gnomad_path',
    'bravo_vcf', 'kaviar_vcf') the entries of variant_cache are updated in
    place with that source's frequencies ('' when unavailable). Each line of
    line_cache is then split on tabs against header, and a
    'POPF=<alt>|<field>|...' entry (one comma-separated group per ALT
    allele, fields taken from variant_cache in the order of `fields`) is
    appended to its INFO column before the line is written to outf.

    options must also provide 'human_ref_pysam', which is passed to
    clean_variant when building the variant id of each ALT allele.
    '''
    pop_freqs = options['pop_freqs']

    # Each annotator module is imported only when its source is configured,
    # so an unused (possibly uninstalled) backend cannot break the others.

    # gnomad
    if 'gnomad_path' in pop_freqs:
        import gnomad_utils
        gnomads = gnomad_utils.overall_freqs(
            list(variant_cache.keys()), pop_freqs['gnomad_path'])
        for variant in variant_cache:
            for key in ('gnomad_af', 'gnomad_hom_f'):
                value = gnomads[variant][key]
                variant_cache[variant][key] = '' if value is None else value

    # bravo
    if 'bravo_vcf' in pop_freqs:
        import bravo_utils
        bravos = bravo_utils.bravo(list(variant_cache.keys()),
                                   pop_freqs['bravo_vcf'])
        for variant in variant_cache:
            if variant not in bravos:
                variant_cache[variant]['bravo_af'] = ''
                variant_cache[variant]['bravo_hom_f'] = ''
                continue
            hit = bravos[variant]
            variant_cache[variant]['bravo_af'] = hit['af']
            # two alleles per homozygote over the allele number; left empty
            # when the allele number is zero instead of dividing by zero
            variant_cache[variant]['bravo_hom_f'] = (
                hit['Hom'] * 2 / hit['an'] if hit['an'] else '')

    # kaviar
    if 'kaviar_vcf' in pop_freqs:
        import kaviar_utils
        kaviars = kaviar_utils.kaviar(list(variant_cache.keys()),
                                      pop_freqs['kaviar_vcf'])
        for variant in variant_cache:
            if variant not in kaviars:
                variant_cache[variant]['kaviar_af'] = ''
            else:
                variant_cache[variant]['kaviar_af'] = kaviars[variant]['af']

    for line in line_cache:
        record = dict(zip(header, line.rstrip().split('\t')))
        pop_info = []
        for alt in record['ALT'].split(','):
            v_id = clean_variant('-'.join([
                record['CHROM'],
                record['POS'],
                record['REF'],
                alt,
            ]), human_ref_pysam=options['human_ref_pysam'])
            pop_info.append(
                '|'.join([alt] + [str(variant_cache[v_id][f])
                                  for f in fields]))
        popf = 'POPF=' + ','.join(pop_info)
        info = record['INFO']
        # '.' marks a missing INFO column in VCF; replace it rather than
        # producing the malformed '.;POPF=...'
        if info in ('', '.'):
            record['INFO'] = popf
        else:
            record['INFO'] = ';'.join([info, popf])
        outf.write('\t'.join([record[h] for h in header]) + '\n')