def extract_metadata(src, dest_csv, dest_bed):
    """
    Extract metadata for cSNPs or rSNPs by querying UCSC database

    The fetched records are summarized, passed through a fixed sequence of
    cleaning steps, and then written twice: once as a CSV indexed by "name"
    and once as a BED file.

    :param src: the rsID list
    :param dest_csv: the feature matrix
    :param dest_bed: the name of bed file to be generated
    :return: None
    """
    rsid_list = __read_id(src)

    # Cleaning steps, applied strictly in this order. Each entry is the step
    # function plus the keyword arguments it is called with.
    pipeline = (
        (__remove_non_regular_chrom, {"verbose": True}),
        (__remove_non_single_class, {"verbose": True}),
        (__normalize_allele_strand, {}),
        (__build_allele_freq_map, {}),
        (__identify_major_minor_alleles, {"verbose": True}),
        (__revise_alleles_with_equal_freqs, {}),
        (__drop_redundant_col, {}),
        (__normalize_chrom_coord, {}),
        (CT.remove_dup_on_chrY, {}),
    )

    with GenomeBrowserClient('local_hg19') as client:
        snp_dfm = client.fetch_metadata(rsid_list)
        __print_summary(snp_dfm)

        for step, kwargs in pipeline:
            snp_dfm = step(snp_dfm, **kwargs)

        # The CSV is written with "name" as the index; the BED writer gets
        # the same frame with "name" restored as a regular column.
        indexed_dfm = snp_dfm.set_index("name")
        __to_csv(indexed_dfm, dest_csv)
        __to_bed(indexed_dfm.reset_index(), dest_bed)
def get_feat(self, _input):
    """
    Fetch coordinates for the given rsIDs and de-duplicate them.

    :param _input: the rsIDs, passed unchanged to
        ``GenomeBrowserClient.fetch_coord``
    :return: the fetched coordinates after ``remove_dup_on_chrY``
    """
    # The client is configured by this object's `db_config_key`.
    with GenomeBrowserClient(self.db_config_key) as client:
        return remove_dup_on_chrY(client.fetch_coord(_input))
def get_feat(self, _input):
    """
    Fetch alleles for the given rsIDs, de-duplicate and transform them.

    :param _input: the rsIDs, passed unchanged to
        ``GenomeBrowserClient.fetch_alleles``
    :return: the fetched alleles after ``remove_dup_on_chrY`` and
        ``AlleleUtil.transform_cols`` have been applied, in that order
    """
    # The client is configured by this object's `db_config_key`.
    with GenomeBrowserClient(self.db_config_key) as client:
        deduped = remove_dup_on_chrY(client.fetch_alleles(_input))
        return AlleleUtil.transform_cols(deduped)
def testRemoveDupOnChrY(self):
    """
    `remove_dup_on_chrY` must drop the chrY copy of a duplicated SNP.

    The fixture has "rs3" twice, once on chr3 and once on chrY; after
    de-duplication three rows remain, none on chrY, and "rs3" appears once.
    """
    fixture = pandas.DataFrame({
        'name': ["rs1", "rs2", "rs3", "rs3"],
        'chrom': ["chr1", "chr2", "chr3", "chrY"],
        "tssDistance": [1, 2, 3, 4],
    })

    result = remove_dup_on_chrY(fixture)

    n_rows = result.shape[0]
    on_chr_y = result["chrom"] == "chrY"
    is_rs3 = result["name"] == "rs3"

    self.assertEqual(n_rows, 3)
    self.assertFalse(on_chr_y.any())
    self.assertEqual(is_rs3.sum(), 1)
def __faulty_filter_on_allele(rsid, db_config_key):
    """
    Reproduce the previous, known-to-be-faulty allele-frequency filter.

    The selection rule is intentionally left faulty; the individual flaws are
    described in the comments below. Only the implementation was modernized:
    the row-wise ``DataFrame.apply(..., reduce=True)`` calls were replaced
    with vectorized counts, because the ``reduce`` keyword no longer exists in
    current pandas and would be forwarded to the lambda, raising ``TypeError``.

    :param rsid: the rsIDs, passed unchanged to
        ``GenomeBrowserClient.fetch_alleles``
    :param db_config_key: configuration key used to create the
        ``GenomeBrowserClient``
    :return: the selected rows, with all bi-allelic entries first, then the
        selected tri-allelic ones, then the selected quad-allelic ones
    """
    maf_thld = 0.05
    bases = list("ATCG")

    with GenomeBrowserClient(db_config_key) as gb_client:
        snp_allele = gb_client.fetch_alleles(rsid)
        snp_allele = ct.remove_dup_on_chrY(snp_allele)
        # NOTE(review): the code below requires columns "A", "T", "C", "G"
        # holding per-base frequencies; presumably `transform_cols` creates
        # them -- confirm.
        snp_allele = AlleleUtil.transform_cols(snp_allele)

        freq = snp_allele.loc[:, bases]
        present = freq > 0
        # Number of alleles observed (freq > 0) per SNP.
        n_allele = present.sum(axis=1)
        # Number of observed alleles whose frequency is below the threshold.
        n_rare = (present & (freq < maf_thld)).sum(axis=1)

        # all mono-allelic are excluded
        # mono = (n_allele == 1)

        # all bi-allelic are included
        # The problem of the previous filter: we didn't apply the "MAF >= 5%"
        # rule to biallelic SNPs
        bi = (n_allele == 2)

        # include if min freq < 0.05 (then we can just discard this min freq);
        # This is faulty because we may include an entry with
        # freq = (0.96, 0.02, 0.02)
        # "min positive freq < threshold" is the same as "at least one
        # observed allele is below the threshold".
        tri = (n_allele == 3) & (n_rare >= 1)

        # include if min 2 freqs < 0.05 (then we can just discard these 2 min
        # freqs);
        # This is faulty because we may include an entry with
        # freq = (0.94, 0.02, 0.02, 0.02)
        # "the two smallest freqs are both < threshold" is the same as "at
        # least two observed alleles are below the threshold".
        quad = (n_allele == 4) & (n_rare >= 2)

        bi_dfm = snp_allele.loc[bi, :]
        tri_dfm = snp_allele.loc[tri, :]
        quad_dfm = snp_allele.loc[quad, :]

        # No need to query Biomart because it's known that what Biomart would
        # return is empty
        # And PCE exclusion is also done after `get_df_1kb`

        return pd.concat([bi_dfm, tri_dfm, quad_dfm], axis=0)
def maf_filter(snp_dfm, maf_threshold=0.05, use_biomart=False, verbose=False):
    """
    Keep the SNPs that have exactly two alleles passing the MAF threshold.

    An allele "passes" when its frequency is positive and at least
    ``maf_threshold``. A SNP is kept only if exactly two of its four base
    frequencies pass. Consequently a SNP is not valid if:

    - CASE 1: it has only one allele;
    - CASE 2: it has two alleles but maf < ``maf_threshold``;
    - CASE 3: it has three or four alleles but you cannot tell which is minor
      because at least 3 alleles have freq >= ``maf_threshold``. (Alleles
      with freq < ``maf_threshold`` are simply disregarded.)

    When ``use_biomart`` is set, the rejected SNPs that are not bi-allelic
    are queried in Biomart for a second chance, and the Biomart records that
    pass the same rule are appended to the result.

    The counts are computed with vectorized comparisons rather than
    ``DataFrame.apply(..., reduce=True)``: the ``reduce`` keyword no longer
    exists in current pandas and would be forwarded to the row function,
    raising ``TypeError``.

    :param snp_dfm: data frame with a "name" column and the frequency columns
        "A", "T", "C" and "G"
    :param maf_threshold: minimum frequency for an allele to count as valid
    :param use_biomart: whether to re-query rejected, non-bi-allelic SNPs in
        Biomart
    :param verbose: passed through to ``BiomartClient2.query_snp``
    :return: the rows of ``snp_dfm`` that passed, followed (when
        ``use_biomart`` is set) by the passing Biomart rows
    """
    bases = list('ATCG')

    def _count_valid_alleles(dfm):
        # Per-row number of bases with freq > 0 and freq >= maf_threshold.
        freq = dfm.loc[:, bases]
        return ((freq >= maf_threshold) & (freq > 0)).sum(axis=1)

    def _count_alleles(dfm):
        # Per-row number of bases that are observed at all (freq > 0).
        return (dfm.loc[:, bases] > 0).sum(axis=1)

    snp_valid = _count_valid_alleles(snp_dfm) == 2

    if not use_biomart:
        return snp_dfm.loc[snp_valid, :]

    snp_biallelic = _count_alleles(snp_dfm) == 2
    # Rejected bi-allelic SNPs (CASE 2) are not retried; only CASE 1 and
    # CASE 3 get a second chance.
    retry_rsid = snp_dfm.loc[~snp_valid & ~snp_biallelic, 'name'].tolist()

    with BiomartClient2() as bm_client:
        bm_df = bm_client.query_snp(rsid_list=retry_rsid, verbose=verbose)
        bm_df = remove_dup_on_chrY(bm_df)

        bm_valid = _count_valid_alleles(bm_df) == 2

        return pd.concat(
            [snp_dfm.loc[snp_valid, :], bm_df.loc[bm_valid, :]], axis=0)