def get_feat(self, _input):
    """
    Look up the coordinates of the given SNPs.

    :param _input: the rsIDs, handed unchanged to `fetch_coord`
    :return: the output of `fetch_coord` after it went through
        `remove_dup_on_chrY`
    """
    with GenomeBrowserClient(self.db_config_key) as client:
        return remove_dup_on_chrY(client.fetch_coord(_input))
def extract_metadata(src, dest_csv, dest_bed):
    """
    Query the UCSC database for the metadata of cSNPs or rSNPs, clean the
    result and write it out both as a feature matrix and as a BED file.

    :param src: the rsID list
    :param dest_csv: where the feature matrix is written
    :param dest_bed: the name of the bed file to be generated
    :return: None
    """
    # Cleaning steps with their keyword options; they are applied strictly
    # in the order listed here.
    cleaning_steps = (
        (__remove_non_regular_chrom, {'verbose': True}),
        (__remove_non_single_class, {'verbose': True}),
        (__normalize_allele_strand, {}),
        (__build_allele_freq_map, {}),
        (__identify_major_minor_alleles, {'verbose': True}),
        (__revise_alleles_with_equal_freqs, {}),
        (__drop_redundant_col, {}),
        (__normalize_chrom_coord, {}),
        (CT.remove_dup_on_chrY, {}),
    )

    rsid = __read_id(src)

    with GenomeBrowserClient('local_hg19') as gb_client:
        snps = gb_client.fetch_metadata(rsid)

        __print_summary(snps)

        for step, options in cleaning_steps:
            snps = step(snps, **options)

        # The CSV is written with `name` as the index; the BED writer gets
        # the same frame with `name` turned back into a regular column.
        indexed_snps = snps.set_index("name")
        __to_csv(indexed_snps, dest_csv)
        __to_bed(indexed_snps.reset_index(), dest_bed)
def get_feat(self, _input):
    """
    Build the transcription-factor (TF) features of the given SNPs.

    :param _input: the SNP data frame; the columns `chrom`, `chromStart`,
        `chromEnd` and `name` are read from it
    :return: the output of `binary_encode` on the `tfName` column, without
        the three coordinate columns
    """
    legacy_mode = self.reproduce_osu17
    snp_dfm = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']]

    with GenomeBrowserClient(self.db_config_key) as gb_client:
        rsids = snp_dfm.loc[:, 'name'].tolist()
        tf_df = gb_client.fetch_tf(rsids)

        snp_dfm = snp_dfm.merge(tf_df, how='left', on=['name', 'chrom'], copy=True)
        # SNPs without a match in `tf_df` have no count after the left merge.
        snp_dfm.loc[:, 'tfCount'] = snp_dfm.loc[:, 'tfCount'].fillna(0)

        # osu17 has no prefix on its TF column names;
        # a "tf_" prefix is preferred for osu18.
        tf_col_prefix = None if legacy_mode else "tf_"
        snp_dfm = self.binary_encode(snp_dfm,
                                     cat_col_name="tfName",
                                     cat_col_sep=',',
                                     bin_col_prefix=tf_col_prefix)

        if legacy_mode:
            # The osu17 feature matrix contains an all-zero TF column,
            # `POLR3G`, which `gb_client` won't return in its result.
            snp_dfm = snp_dfm.assign(POLR3G=0)

        return snp_dfm.drop(['chrom', 'chromStart', 'chromEnd'], axis=1)
def get_feat(self, _input):
    """
    Attach the genome-segmentation annotations to the given SNPs.

    :param _input: the SNP data frame; the columns `chrom` and `name` are
        read from it
    :return: the SNP names left-merged with the output of
        `identify_genome_seg`, with `chrom` dropped and missing values
        replaced by 0
    """
    snp_dfm = _input.loc[:, ['chrom', 'name']]

    with GenomeBrowserClient(self.db_config_key) as gb_client:
        seg_dfm = gb_client.identify_genome_seg(snp_dfm.loc[:, 'name'])
        # NOTE(review): a `ct.remove_dup_on_chrY(result)` call on the query
        # result is commented out in this method, i.e. it is not applied.

        if not self.reproduce_osu17:
            # Use clearer names for osu18: `ch<i>Name` / `sw<i>Name` become
            # `Chromhmm<CellLine>` / `Segway<CellLine>`.
            cell_lines = ('Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec', 'K562')
            clearer_names = {}
            for pos, cell_line in enumerate(cell_lines, start=1):
                clearer_names['ch{}Name'.format(pos)] = 'Chromhmm' + cell_line
                clearer_names['sw{}Name'.format(pos)] = 'Segway' + cell_line
            seg_dfm = seg_dfm.rename(columns=clearer_names)

        merged = snp_dfm.merge(seg_dfm, how='left', on=['name', 'chrom'])

        return merged.drop(['chrom'], axis=1).fillna(0)
def __get_candidate_dfm(self, rsid):
    """
    Collect the candidate TSS distances of the given SNPs, retrying the ones
    that the first query misses.

    The first pass calls `compute_tss_dist` with `adjacent_bins=1`. SNPs
    whose name is absent from that result are queried again with
    `adjacent_bins=-1`. SNPs still missing after the second pass are only
    reported on stdout.

    :param rsid: the rsIDs; expected to be a pandas Series, since `isin` and
        boolean masking are applied to it
    :return: the first-pass result alone if it covers every SNP, otherwise
        both results concatenated under a fresh index
    """
    with GenomeBrowserClient(self.db_config_key) as gb_client:
        first_run_result = gb_client.compute_tss_dist(rsid, adjacent_bins=1)

        remainder = rsid[~rsid.isin(first_run_result.loc[:, "name"])]
        if remainder.empty:
            return first_run_result

        print(
            "[TssDistUtil]: No distance found for {n} SNP(s) after 1st run: \r\n{snps}"
            .format(n=remainder.shape[0], snps=remainder.tolist()))

        second_run_result = gb_client.compute_tss_dist(remainder, adjacent_bins=-1)

        remainder = remainder[~remainder.isin(second_run_result.loc[:, "name"])]
        if not remainder.empty:
            print(
                "[TssDistUtil]: No distance found for {n} SNP(s) after 2nd run: \r\n{snps}"
                .format(n=remainder.shape[0], snps=remainder.tolist()))

        # IMPORTANT: do apply `reset_index()` or set `ignore_index=True` here.
        # If not, multiple entries would share a same index, which would be
        # a disaster for the `idxmin()` operation in `__minTssDist`.
        return pd.concat([first_run_result, second_run_result], ignore_index=True)
def get_feat(self, _input):
    """
    Fetch the alleles of the given SNPs and reshape them into feature columns.

    :param _input: the rsIDs, handed unchanged to `fetch_alleles`
    :return: the output of `fetch_alleles`, passed through
        `remove_dup_on_chrY` and then `AlleleUtil.transform_cols`
    """
    with GenomeBrowserClient(self.db_config_key) as client:
        allele_dfm = client.fetch_alleles(_input)
        allele_dfm = remove_dup_on_chrY(allele_dfm)
        return AlleleUtil.transform_cols(allele_dfm)
def pce_filter(snp_dfm, verbose=False):
    """
    Drop the SNPs that are located in protein-coding exons.

    :param snp_dfm: the SNP data frame; `chrom` and `chromStart` are read
        from every row
    :param verbose: if True, print the SNPs that are being filtered out
    :return: the rows of `snp_dfm` for which `in_protein_coding_exon`
        returned a falsy value
    """
    with GenomeBrowserClient('local_hg19') as gb_client:
        # Build the mask with a plain loop instead of a row-wise
        # `DataFrame.apply`: an empty frame then simply yields an empty
        # mask, and `bool()` makes sure the negation below is logical even
        # if the client reports 0/1 rather than real booleans.
        in_pce = [
            bool(gb_client.in_protein_coding_exon(chrom, chrom_start))
            for chrom, chrom_start in zip(snp_dfm['chrom'], snp_dfm['chromStart'])
        ]

    if verbose:
        print(
            "[pce_filter] found {n} SNP in protein-coding exons: \r\n {dfm}"
            .format(n=sum(in_pce), dfm=snp_dfm.loc[in_pce]))

    return snp_dfm.loc[[not hit for hit in in_pce]]
def get_feat(self, _input):
    """
    Count the VISTA enhancers found for each SNP and total their scores.

    :param _input: the SNP data frame; the columns `chrom`, `chromStart`,
        `chromEnd` and `name` are selected from it
    :return: a data frame with the columns `name`, `vistaEnhancerCnt` and
        `vistaEnhancerTotalScore`
    """
    snps = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']]

    with GenomeBrowserClient(self.db_config_key) as gb_client:
        # One query per SNP. A plain list (rather than a row-wise
        # `DataFrame.apply`) keeps the per-SNP results as they are returned
        # and yields an empty list for an empty input frame.
        vh_lists = [
            gb_client.select_vista_enhancer(chrom, chrom_start)
            for chrom, chrom_start in zip(snps['chrom'], snps['chromStart'])
        ]

    vh_count = [len(vh_list) for vh_list in vh_lists]
    # Every entry is unpacked as a pair whose second item is the score.
    vh_score = [sum(score for _, score in vh_list) for vh_list in vh_lists]

    return pd.DataFrame(data=dict(name=snps['name'],
                                  vistaEnhancerCnt=vh_count,
                                  vistaEnhancerTotalScore=vh_score))
def __yield_feat(self, snp_bed_fn):
    """
    Yield one TSS-distance feature column per source BED file.

    For every file name in `self.src_data_fn` the SNPs are intersected with
    that file, `select_tss_dist` is called for each intersecting region, the
    results are combined per SNP and reduced by `min_tss_dist`.

    :param snp_bed_fn: path of the 4-column SNP BED file
        (`chrom`, `chromStart`, `chromEnd`, `name`)
    :return: a generator of pandas Series indexed by SNP name; each Series
        is named `<fn>TssDist`, with a trailing ".bed" removed from `fn`
    """
    snp_bed_obj = BedTool(snp_bed_fn)
    snp_bed_dfm = pd.read_table(
        snp_bed_fn,
        header=None,
        names=['chrom', 'chromStart', 'chromEnd', 'name'])

    for fn in self.src_data_fn:
        complement_filename = os.path.join(self.src_data_dir, fn)
        comp_bed_obj = BedTool(complement_filename)

        intx = snp_bed_obj.intersect(comp_bed_obj, wb=True)

        intx_dfm = pd.read_table(StringIO(str(intx)),
                                 header=None,
                                 names=[
                                     'snpChrom', 'snpChromStart',
                                     'snpChromEnd', 'snpName', 'blockChrom',
                                     'blockChromStart', 'blockChromEnd',
                                     'compChrom', 'compChromStart',
                                     'compChromEnd'
                                 ])

        snp_comp = intx_dfm[[
            'snpName', 'compChrom', 'compChromStart', 'compChromEnd'
        ]]

        with GenomeBrowserClient(self.db_config_key) as gb_client:
            # `result_type='reduce'` replaces the former `reduce=True`, a
            # keyword that no longer exists in `DataFrame.apply`. It asks
            # for one value per row in a Series, also for an empty frame.
            tss_dist = snp_comp.apply(
                lambda row: gb_client.select_tss_dist(
                    row['compChrom'], row['compChromStart'],
                    row['compChromEnd']),
                axis=1,
                result_type='reduce')

        tss_dist = pd.DataFrame(tss_dist, columns=['tssDist'])
        tss_dist = pd.concat([snp_comp['snpName'], tss_dist], axis=1)

        # Combine the per-region results of each SNP.
        # NOTE(review): presumably every `tssDist` cell is a list, so that
        # summing concatenates them - confirm against `select_tss_dist`.
        tss_dist = tss_dist.groupby('snpName').agg(sum).reset_index()

        # fn[:-4] removes the ".bed" extension
        col_name = fn[:-4] + 'TssDist' if fn.endswith(".bed") else fn + 'TssDist'

        tss_dist.loc[:, col_name] = tss_dist.loc[:, 'tssDist'].apply(min_tss_dist)

        # The left merge keeps every SNP of the input BED file.
        result_dfm = pd.merge(snp_bed_dfm,
                              tss_dist,
                              how='left',
                              left_on='name',
                              right_on='snpName').set_index('name')

        yield result_dfm[col_name]
def get_feat(self, _input):
    """
    Attach the phastCons value to each of the given SNPs.

    :param _input: the SNP data frame; `name` is used for the query, and
        `name` plus `chrom` for the merge
    :return: a data frame with the columns `name` and `phastCons`, missing
        values replaced by 0
    """
    with GenomeBrowserClient(self.db_config_key) as client:
        rsids = _input.loc[:, 'name'].tolist()
        scores = client.fetch_phastcons(rsids)

        merged = _input.merge(scores, how='left', on=['name', 'chrom'], copy=True)

        wanted_cols = ["name", "phastCons"]
        return merged.loc[:, wanted_cols].fillna(0)
def get_feat(self, _input):
    """
    Flag each SNP with the result of the `in_nki_lad` query.

    :param _input: the SNP data frame in BED format; `chrom`, `chromStart`
        and `name` are read from it
    :return: a data frame with the columns `name` and `NkiLad`, in that
        order
    """
    snps = _input

    with GenomeBrowserClient(self.db_config_key) as gb_client:
        # One query per SNP. A list comprehension (rather than a row-wise
        # `DataFrame.apply`) yields an empty list for an empty input frame.
        in_lad = [
            gb_client.in_nki_lad(chrom, chrom_start)
            for chrom, chrom_start in zip(snps['chrom'], snps['chromStart'])
        ]

    overlap = pd.DataFrame(data=dict(name=snps['name'], NkiLad=in_lad))

    # Rearrange column order
    return overlap[['name', 'NkiLad']]
def __faulty_filter_on_allele(rsid, db_config_key):
    """
    Reproduce the previous, known-faulty allele-based SNP filter.

    SNPs are grouped by the number of alleles (A/T/C/G) that have a positive
    frequency and are selected as follows:

    - mono-allelic: all excluded.
    - bi-allelic: all included. Faulty, because the "MAF >= 5%" rule is not
      applied to bi-allelic SNPs.
    - tri-allelic: included if the smallest frequency is below 5% (so that
      this frequency could just be discarded). Faulty, because an entry
      with freq = (0.96, 0.02, 0.02) may be included.
    - quad-allelic: included if the two smallest frequencies are below 5%
      (so that these two could just be discarded). Faulty, because an entry
      with freq = (0.94, 0.02, 0.02, 0.02) may be included.

    :param rsid: the rsIDs, handed to `fetch_alleles`
    :param db_config_key: the configuration key handed to
        `GenomeBrowserClient`
    :return: the selected rows of the transformed allele data frame:
        bi-allelic first, then tri-allelic, then quad-allelic
    """
    maf_thld = 0.05
    bases = list("ATCG")

    with GenomeBrowserClient(db_config_key) as gb_client:
        snp_allele = gb_client.fetch_alleles(rsid)

    snp_allele = ct.remove_dup_on_chrY(snp_allele)
    snp_allele = AlleleUtil.transform_cols(snp_allele)

    # Column-wise masks replace the former row-wise `apply(..., reduce=True)`
    # calls (`reduce` is no longer an `apply` keyword); they also give empty
    # masks when there is no row to look at.
    freqs = snp_allele.loc[:, bases]
    is_present = freqs > 0
    n_allele = is_present.sum(axis=1)
    # Number of present alleles whose frequency is under the threshold.
    n_under_thld = (is_present & (freqs < maf_thld)).sum(axis=1)

    bi_dfm = snp_allele.loc[n_allele == 2, :]

    # "Smallest positive frequency < threshold" is the same as
    # "at least one present allele is under the threshold".
    tri_dfm = snp_allele.loc[(n_allele == 3) & (n_under_thld >= 1), :]

    # All four alleles are present here, so "the two smallest frequencies
    # are < threshold" is the same as "at least two are under the threshold".
    quad_dfm = snp_allele.loc[(n_allele == 4) & (n_under_thld >= 2), :]

    # No need to query Biomart because it's known that what Biomart would
    # return is empty. And PCE exclusion is also done after `get_df_1kb`.
    return pd.concat([bi_dfm, tri_dfm, quad_dfm], axis=0)
def test_in_protein_coding_exon(self):
    """Position 117306990 on chr7 must be reported as inside a protein-coding exon."""
    with GenomeBrowserClient('local_hg19') as client:
        self.assertTrue(
            client.in_protein_coding_exon(chrom='chr7', chrom_start=117306990))