def get_ann_from_output_snpeff(temp_out_name):
    """Load the ANN allele and annotation columns of a VCF into one table.

    The ``ANN`` INFO field is parsed with ``allel.ANNTransformer``; the
    number of annotation entries read per variant is taken from the
    module-level name ``num_ann_max``.

    Parameters
    ----------
    temp_out_name
        Path of the VCF file to read (presumably SnpEff output, going by
        the function name).

    Returns
    -------
    pandas.DataFrame
        The ``ANN_Allele`` values followed by the ``ANN_Annotation``
        values, one row per variant, with columns renumbered ``0..n-1``.
        An empty frame is returned when the file yields no variants.
    """
    callset = allel.read_vcf(
        temp_out_name,
        fields='ANN',
        transformers=allel.ANNTransformer(),
        numbers={'ANN': num_ann_max},
    )
    if callset is None:
        # read_vcf hands back None (not an empty dict) when the file has no
        # variant records; subscripting it would raise TypeError.
        return pd.DataFrame()

    alleles = pd.DataFrame(data=callset['variants/ANN_Allele'])
    annotations = pd.DataFrame(data=callset['variants/ANN_Annotation'])
    combined = pd.concat((alleles, annotations), axis=1)
    # Both halves carry their own 0..k-1 labels; renumber to avoid duplicates.
    combined.columns = range(combined.shape[1])
    return combined
def __get_variants_from_vcf(cls, vcf: str) -> Optional[Dict[str, Any]]:
    """Read the fields listed in ``cls.FIELD_NAMES`` from a VCF file.

    The file is parsed with ``allel.read_vcf`` using ``allel.ANNTransformer``.

    Args:
        vcf: Path of the VCF file to read.

    Returns:
        The dictionary produced by ``allel.read_vcf``, or ``None`` when the
        (filtered) VCF file has no variants.

    Raises:
        FileNotFoundError: If reading the file raises ``IOError``; the
            original error is attached as the cause.
    """
    try:
        variants = allel.read_vcf(vcf, fields=cls.FIELD_NAMES, transformers=allel.ANNTransformer())
    except IOError as err:
        # Chain explicitly so the underlying I/O failure stays visible in the traceback.
        raise FileNotFoundError("File " + vcf + " not found or cannot be opened.") from err
    # variants is None precisely when filtered vcf file has no variants
    return variants
def save_mutect2(*args):
    """Build a per-variant table from one sample's annotated Mutect2 VCF.

    The VCF is read three times with ``allel.vcf_to_dataframe``: once for the
    per-variant ALT count, once for every field except ``ANN`` and once for
    ``ANN`` through ``allel.ANNTransformer``.  Per-allele columns
    (``<name>_1`` .. ``<name>_<alt_len>``) are collapsed into one
    comma-separated ``<name>`` column; missing values become ``"-"``.

    Parameters
    ----------
    *args
        ``args[0]`` is the patient identifier and ``args[1]`` the sample
        identifier; both are interpolated into the hard-coded input path.

    Returns
    -------
    pandas.DataFrame
        One row per variant with the fixed columns plus one joined column
        per entry of ``alt_cols``.

    Notes
    -----
    Despite the name, nothing is written to disk.  The ``pfam_annotate`` /
    ``to_pickle`` statements that used to follow the ``return`` could never
    execute and have been removed.
    """
    patient = args[0]
    sample = args[1]
    path_to_mutect2 = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    vcf_path = os.path.join(path_to_mutect2, "output_transcr_new_predicted_dbsnp.vcf")

    # First pass: only the number of ALT alleles per variant, to learn how
    # many per-allele columns the later reads must produce.
    numalt_table = allel.vcf_to_dataframe(vcf_path, fields=["numalt"], alt_number=1)
    alt_len = max(numalt_table["numalt"])
    # NOTE(review): with alt_len == 1 scikit-allel appears to emit unsuffixed
    # columns ("ALT" rather than "ALT_1"), which the suffixed lookups below
    # would miss -- confirm whether single-ALT files need handling.

    col_list = [
        "CHROM", "POS", "ID", "REF", "DP", "FILTER_PASS",
        "ANN_Annotation", "ANN_Annotation_Impact", "ANN_Gene_Name",
        "ANN_Gene_ID", "ANN_HGVS_c", "ANN_HGVS_p", "ANN_AA_pos",
    ]
    alt_cols = [
        "ALT", "dbNSFP_Polyphen2_HVAR_score", "dbNSFP_SIFT_score",
        "dbNSFP_MetaLR_score", "dbNSFP_Polyphen2_HDIV_score",
        "dbNSFP_Uniprot_acc", "dbNSFP_CADD_phred",
        "dbNSFP_Polyphen2_HDIV_pred", "dbNSFP_MutationTaster_score",
        "dbNSFP_SIFT_pred", "dbNSFP_MutationTaster_pred",
        "dbNSFP_Polyphen2_HVAR_pred", "dbNSFP_MetaLR_pred",
    ]
    allele_numbers = range(1, alt_len + 1)
    per_alt_cols = [f"{col}_{k}" for col in alt_cols for k in allele_numbers]
    col_list.extend(per_alt_cols)

    mutect2_wo_ann = allel.vcf_to_dataframe(
        vcf_path, fields="*", alt_number=alt_len, exclude_fields="ANN")
    mutect2_w_ann = allel.vcf_to_dataframe(
        vcf_path, fields="ANN", alt_number=alt_len,
        transformers=allel.ANNTransformer())

    mutect2 = pd.concat([mutect2_wo_ann, mutect2_w_ann], axis=1)
    mutect2 = mutect2[col_list].fillna("-")

    for col in alt_cols:
        # Join however many per-allele columns exist; the count is alt_len,
        # not a fixed three.
        parts = [mutect2[f"{col}_{k}"].map(str) for k in allele_numbers]
        joined = parts[0]
        for part in parts[1:]:
            joined = joined + "," + part
        mutect2[col] = joined

    return mutect2.drop(columns=per_alt_cols)
def parse_vcf(vcf, rs_ids, bed_file, outputdir, sampleId, vcftools):
    """Slice a VCF on a BED file and collect the variants listed in ``rs_ids``.

    ``vcftools`` is run to write ``<outputdir>/<sampleId>_PGx.recode.vcf``,
    which is then read with ``allel.read_vcf``.  A variant is kept when one
    of its rs identifiers is a key of ``rs_ids`` or when its ``CHROM:POS``
    string is one of the values of ``rs_ids``.  Only the first sample's
    genotype is used.

    Parameters
    ----------
    vcf
        Gzipped input VCF (passed to vcftools via ``--gzvcf``).
    rs_ids
        Mapping whose keys are compared with the rs identifiers of each
        variant and whose values are compared with ``"CHROM:POS"``.
    bed_file
        BED file passed to vcftools via ``--bed``.
    outputdir
        Directory in which the sliced VCF is written.
    sampleId
        Prefix of the sliced VCF's file name.
    vcftools
        The vcftools executable to invoke.

    Returns
    -------
    tuple
        ``(ids_found_in_patient, temp_vcf)``: a DataFrame with columns
        ``position_GRCh37``, ``ref_GRCh37``, ``alt_GRCh37``, ``rsid``,
        ``variant_annotation``, ``gene`` and ``filter`` (empty when nothing
        matches or the sliced VCF holds no variants), and the path of the
        sliced VCF.

    Raises
    ------
    IOError
        If the sliced VCF already exists before vcftools is run.
    ValueError
        If a genotype holds an allele index other than 0, 1 or 2.
    SystemExit
        If the sliced VCF cannot be opened.
    """
    match_on_rsid = 0
    match_on_location = 0

    # Slice VCF on bed file
    temp_vcf_prefix = outputdir + '/' + sampleId + '_PGx'
    temp_vcf = outputdir + '/' + sampleId + '_PGx.recode.vcf'

    # Refuse to overwrite an existing output vcf.  temp_vcf already ends in
    # ".recode.vcf", so the suffix must not be appended to the message again.
    if os.path.exists(temp_vcf):
        raise IOError("Temporary VCF file " + temp_vcf + " already exists. Exiting.")

    subprocess.run([vcftools, '--gzvcf', vcf, '--bed', bed_file,
                    '--out', temp_vcf_prefix, '--recode', '--recode-INFO-all'])
    print("[INFO] Subprocess completed.")

    # Read in VCF file
    try:
        variants = allel.read_vcf(
            temp_vcf,
            fields=['samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM',
                    'variants/FILTER', 'variants/ID', 'variants/POS',
                    'variants/QUAL', 'variants/REF', 'variants/ANN'],
            transformers=allel.ANNTransformer())
    except IOError:
        sys.exit("[ERROR] File " + temp_vcf + " not found or cannot be opened.")

    columns = ['position_GRCh37', 'ref_GRCh37', 'alt_GRCh37', 'rsid',
               'variant_annotation', 'gene', 'filter']
    rows = []

    # read_vcf gives None when the sliced VCF holds no variant records;
    # there is nothing to match in that case.
    variant_ids = [] if variants is None else variants['variants/ID']

    for i, rs_number in enumerate(variant_ids):
        chrom = variants['variants/CHROM'][i]
        pos = variants['variants/POS'][i]
        location = str(chrom) + ":" + str(pos)
        rs_id_filt = _extract_rs_ids(rs_number)

        rsid_match = any(rs in rs_id_filt for rs in rs_ids)
        if not rsid_match and location not in rs_ids.values():
            continue
        if rsid_match:
            match_on_rsid += 1
        else:
            match_on_location += 1

        filter_status = "PASS" if variants['variants/FILTER_PASS'][i] else "FILTERED"

        # Keep the called alleles as separate items: indexing a concatenated
        # string would return single characters and break multi-base alleles.
        alleles = _called_alleles(variants['calldata/GT'][i][0],
                                  variants['variants/REF'][i],
                                  variants['variants/ALT'][i])

        rows.append({
            'position_GRCh37': location,
            'rsid': ";".join(rs_id_filt),
            'ref_GRCh37': alleles[0],
            'alt_GRCh37': alleles[1],
            'variant_annotation': variants['variants/ANN_HGVS_c'][i],
            'filter': filter_status,
            'gene': variants['variants/ANN_Gene_Name'][i],
        })

    # Build the frame once; DataFrame.append copied the whole table per row
    # and no longer exists in pandas 2.
    ids_found_in_patient = pd.DataFrame(rows, columns=columns)

    print("[INFO] Matches on RS id: " + str(match_on_rsid))
    print("[INFO] Matches on location: " + str(match_on_location))
    return ids_found_in_patient, temp_vcf


def _extract_rs_ids(id_field):
    """Return the identifiers of one VCF ID value; ';'-joined values keep only 'rs' entries."""
    if ";" in id_field:
        return [rs for rs in id_field.split(";") if rs.startswith("rs")]
    return [id_field]


def _called_alleles(calls, ref, alt):
    """Map the allele indices of one genotype call (0, 1 or 2) to allele strings."""
    alleles = []
    for geno in calls:
        if geno == 0:
            alleles.append(ref)
        elif geno == 1:
            alleles.append(alt[0])
        elif geno == 2:
            alleles.append(alt[1])
        else:
            print(geno)
            raise ValueError("Genotype looks weird")
    return alleles
print('Variants from TLX3 WGS in active enhacers = ', len(enh_mut_tlx))


# In[36]:

# Write the variant set to disk; only needed after a fresh analysis run.
enh_mut_tlx.saveas(join(WGS, 'Ehn_RAG_Active_TLX3_mut.vcf'))


# In[37]:

# Read the saved VCF back as a table, with up to 4 ALT alleles per variant
# and the ANN field parsed by ANNTransformer.
import allel
enh_mut_tlx_tb = allel.vcf_to_dataframe(
    join(WGS, 'Ehn_RAG_Active_TLX3_mut.vcf'),
    fields='*',
    numbers={'ALT': 4},
    transformers=allel.ANNTransformer(),
)


# In[38]:

# Keep only the rows whose FILTER_PASS flag is set.
enh_mut_tlx_tb_fltr = enh_mut_tlx_tb.loc[enh_mut_tlx_tb['FILTER_PASS'].eq(True)]


# In[39]:

print('Variants from TLX3 WGS in active enhacers = ', enh_mut_tlx_tb.shape[0])
print('Variants from TLX3 WGS in active enhacers PASS filter = ', enh_mut_tlx_tb_fltr.shape[0])


# In[40]:

#enh_mut_tlx_tb[['FILTER_map','FILTER_PASS','FILTER_mrd20','FILTER_LowQual','FILTER_mrd10','FILTER_mrd30']].head(40)
#tnm = plt.hist(enh_mut_tlx_tb['REF'].apply(len), bins=40)
def vcf_to_tables(vcf_file, genotype_file, variants_tsv, itd_vcf_file=False):
    """Parse a VCF into a variant annotation table and genotype layers.

    The VCF is read with ``allel.read_vcf`` (``ANN`` parsed through
    ``allel.ANNTransformer``).  Variant annotations are written as a
    tab-separated table and the per-sample layers are written to an HDF5
    file.  When ``itd_vcf_file`` is given and holds at least one record,
    its variants are appended as ``FLT3-ITD`` rows.

    Parameters
    ----------
    vcf_file
        Path of the annotated VCF to parse.
    genotype_file
        Path of the HDF5 file to create (opened in ``'w'`` mode).
    variants_tsv
        Path of the tab-separated variant table to write.
    itd_vcf_file
        Optional path of a VCF with FLT3-ITD calls; any falsy value skips
        this step.  Its ``ID`` values are looked up among the samples of
        ``vcf_file``, ``QUAL`` is used as alt depth and ``VAF`` as the
        allele fraction.

    Returns
    -------
    None
        Results are written to ``variants_tsv`` and ``genotype_file``.
    """
    # load vcf file into numpy arrays, including annotation info from snpeff
    vcf = allel.read_vcf(vcf_file,
                         transformers=allel.ANNTransformer(),
                         fields=['variants/*',
                                 'calldata/GT',
                                 'calldata/AD',
                                 'calldata/GQ',
                                 'calldata/DP',
                                 'samples',
                                 'ANN'])

    # layers to extract:
    # GT: genotype (0: WT, 1: HET, 2: HOM, 3: no call)
    # DP: total read depth
    # GQ: genotype quality
    # AD: alt allele depth
    # RD: ref allele depth
    GT = np.sum(vcf['calldata/GT'], axis=2)
    GT[GT == -2] = 3    # two missing alleles (-1 + -1) -> no call
    DP = np.stack(vcf['calldata/DP'], axis=0)
    GQ = np.stack(vcf['calldata/GQ'], axis=0)
    AD = np.stack(vcf['calldata/AD'][:, :, 1], axis=0)
    RD = np.stack(vcf['calldata/AD'][:, :, 0], axis=0)

    # create variant names: gene:chrom:pos:ref/first-alt
    names = [
        gene + ':' + chrom + ':' + str(pos) + ':' + ref + '/' + alt
        for gene, chrom, pos, ref, alt in zip(vcf['variants/ANN_Gene_Name'],
                                              vcf['variants/CHROM'],
                                              vcf['variants/POS'],
                                              vcf['variants/REF'],
                                              vcf['variants/ALT'][:, 0])
    ]

    # assemble variant annotations
    variants_table = pd.DataFrame(data=names, columns=['Name'])

    # cosmic id
    variants_table['COSMIC_ID'] = vcf['variants/ID']

    # snpeff columns; match on '/ANN_' so the split below always has a tail
    for key in list(vcf):
        if '/ANN_' in key:
            variants_table['SnpEff_' + key.split('/ANN_')[1]] = vcf[key]

    # clinvar columns
    for key in list(vcf):
        if '/CLN' in key:
            variants_table['ClinVar_' + key.split('/')[1]] = vcf[key]

    # optional: add flt3-itd variants to table
    if itd_vcf_file:

        # make sure flt3 vcf holds at least one non-header line
        with open(itd_vcf_file, 'r') as f:
            has_records = any(not line.startswith('#') for line in f)

        if has_records:
            itd_vcf = allel.read_vcf(itd_vcf_file, fields=['*'])

            # create itd variant names (one per vcf record, duplicates kept)
            itd_names = [
                'FLT3-ITD' + ':' + chrom + ':' + str(pos) + ':' + ref + '/' + alt
                for chrom, pos, ref, alt in zip(itd_vcf['variants/CHROM'],
                                                itd_vcf['variants/POS'],
                                                itd_vcf['variants/REF'],
                                                itd_vcf['variants/ALT'][:, 0])
            ]

            # De-duplicate once, keeping first-seen order, so the table rows
            # and the name list are guaranteed to line up and the output does
            # not depend on set iteration order.
            unique_itd_names = list(dict.fromkeys(itd_names))

            # add itd variant rows to variants table
            itd_table = pd.DataFrame(data=unique_itd_names, columns=['Name'])
            names += unique_itd_names
            variants_table = pd.concat([variants_table, itd_table], sort=True)

            # add one row per itd variant to every layer;
            # default for GT is 'no call' (3), all other layers start at 0
            n_itd = itd_table.shape[0]
            GT = np.concatenate((GT, 3 * np.ones((n_itd, GT.shape[1]))), axis=0)
            GQ = np.concatenate((GQ, np.zeros((n_itd, GQ.shape[1]))), axis=0)
            DP = np.concatenate((DP, np.zeros((n_itd, DP.shape[1]))), axis=0)
            AD = np.concatenate((AD, np.zeros((n_itd, AD.shape[1]))), axis=0)
            RD = np.concatenate((RD, np.zeros((n_itd, RD.shape[1]))), axis=0)

            # row index per variant name, column index per cell barcode
            var_ind = {name: idx for idx, name in enumerate(names)}
            bar_ind = {barcode: idx for idx, barcode in enumerate(vcf['samples'])}

            # for each itd record, fill the entry of its cell barcode
            for itd_name, cell_barcode, alt_depth, vaf in zip(itd_names,
                                                              itd_vcf['variants/ID'],
                                                              itd_vcf['variants/QUAL'],
                                                              itd_vcf['variants/VAF']):
                # single-argument call form prints the same on Python 2 and 3
                print(vaf)
                total_depth = int(round(np.true_divide(alt_depth, vaf)))

                # set GT according to vaf: het below 0.9, hom otherwise
                geno = 1 if vaf < 0.9 else 2

                row = var_ind[itd_name]
                col = bar_ind[cell_barcode]

                # store entries in genotyping arrays
                # NOTE(review): RD receives the total depth (same value as
                # DP), not total minus alt depth -- confirm this is intended.
                GT[row, col] = geno
                GQ[row, col] = 100
                DP[row, col] = total_depth
                RD[row, col] = total_depth
                AD[row, col] = alt_depth

    # save variants to file
    variants_table.to_csv(path_or_buf=variants_tsv, sep='\t', index=False)

    # encode variant names and cell barcodes
    names = [n.encode('utf8') for n in names]
    barcodes = [b.encode('utf8') for b in vcf['samples']]

    # save genotyping information to compressed hdf5 file
    with h5py.File(genotype_file, 'w') as f:
        f.create_dataset('GT', data=GT, dtype='i1', compression='gzip')
        f.create_dataset('GQ', data=GQ, dtype='i1', compression='gzip')
        f.create_dataset('DP', data=DP, dtype='i2', compression='gzip')
        f.create_dataset('AD', data=AD, dtype='i2', compression='gzip')
        f.create_dataset('RD', data=RD, dtype='i2', compression='gzip')
        f.create_dataset('VARIANTS', data=names, compression='gzip')
        f.create_dataset('CELL_BARCODES', data=barcodes, compression='gzip')
def _parse_sv_vcfs(self, vcf_paths, ann_fields=None):
    '''
    Merge all SV interval data from multiple vcf's in to a single BedTool instance

    Implementation:
        Use Panda's dataframe for some easy preprocessing, then create a BedTool from a tuple containing each row

    Parameters:
        vcf_paths: iterable of VCF paths; each file must contain exactly one sample
        ann_fields: optional list of extra VCF fields to return as annotations
            (default: none)

    Returns:
        (BedTool of all intervals, annotation DataFrame indexed by the renamed
        index columns -- empty when ann_fields is empty --, list of sample names)

    Raises:
        AssertionError: if a VCF does not contain exactly one sample
    '''

    def split_Ensembl_ids(id_list):
        # Expand ids joined by '-' or '&' into separate ids; '-' takes precedence.
        new_list = []
        for gene_id in id_list:
            if '-' in gene_id:
                new_list.extend(gene_id.split('-'))
            elif '&' in gene_id:
                new_list.extend(gene_id.split('&'))
            else:
                new_list.append(gene_id)
        return new_list

    # None instead of a shared mutable default list
    ann_fields = [] if ann_fields is None else list(ann_fields)

    intervals = []
    sample_names = []
    ann_dfs = []

    # CHR POS STOP needs to be first 3 columns for creation of BedTool instance
    index_fields = list(self.index_cols.keys())
    sample_sv_fields = index_fields + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples']
    parse_fields = set(sample_sv_fields + ann_fields)

    for vcf_path in vcf_paths:
        # use read_vcf because genotype field is not picked up with vcf_to_dataframe
        vcf_dict = allel.read_vcf(vcf_path, ['*'], numbers={'ANN': 1000},
                                  transformers=allel.ANNTransformer())

        assert len(vcf_dict['samples']) == 1, "%s contains 0 or more than 1 sample: %s" % (
            vcf_path, str(vcf_dict['samples']))
        name = vcf_dict.pop('samples')[0]
        sample_names.append(name)

        # Drop a leading 'chr' from CHROM.  str.strip('chr') would remove any
        # of the characters c/h/r from BOTH ends, so slice the prefix instead.
        vcf_dict['variants/CHROM'] = [
            chrom[3:] if chrom.startswith('chr') else chrom
            for chrom in vcf_dict['variants/CHROM']
        ]

        # drop un-needed fields from vcf, cannot pass in parse_fields to read_vcf()
        # because ANN_gene_id is unknown until ANNTransformer runs
        for key in list(vcf_dict.keys()):
            if key not in parse_fields:
                vcf_dict.pop(key)

        # numbers=1000 pads every variant to 1000 entries, most of them empty:
        # drop the empty strings, split delimited ids, de-duplicate, then join
        # with commas.  Sorting keeps the joined string independent of set
        # iteration order.
        gene_id_strings = []
        for ann in vcf_dict['variants/ANN_Gene_ID']:
            gene_ids = split_Ensembl_ids([gene_id for gene_id in ann if gene_id])
            gene_id_strings.append(','.join(sorted(set(gene_ids))))
        vcf_dict['variants/ANN_Gene_ID'] = gene_id_strings

        # a call holding both allele 0 and allele 1 is HET, anything else HOM
        vcf_dict['calldata/GT'] = np.array([
            'HET' if 0 in gt and 1 in gt else 'HOM'
            for gt in vcf_dict.pop('calldata/GT')
        ])

        df = pd.DataFrame(vcf_dict)
        df['samples'] = name

        # workaround for START > END so BedTool doesn't freak out using MalformedBedError
        # START > END is the case for TRV, INV
        swapped = df['variants/END'] < df['variants/POS']
        df.loc[swapped, ['variants/END', 'variants/POS']] = df.loc[
            swapped, ['variants/POS', 'variants/END']].values
        df['variants/POS'] = df['variants/POS'].astype(int)
        df['variants/END'] = df['variants/END'].astype(int)

        df = df.drop_duplicates()

        intervals.extend(df[sample_sv_fields].itertuples(index=False))

        if ann_fields:
            ann_dfs.append(df[index_fields + ann_fields])

    if ann_fields:
        ann_df = (pd.concat(ann_dfs)
                  .astype(str)
                  .rename(columns=self.index_cols)
                  .set_index(list(self.index_cols.values())))
    else:
        ann_df = pd.DataFrame()

    # annotations for the same SV in a vcf can have slighly differing fields
    # (ex. SVSCORE_MEAN), so keep only the first row per index
    ann_df = ann_df[~ann_df.index.duplicated(keep='first')]

    # NOTE(review): prints every interval; looks like leftover debug output --
    # confirm before removing.
    for interval in intervals:
        print(interval)

    return BedTool(intervals), ann_df, sample_names