Ejemplo n.º 1
0
def extract_metadata(src, dest_csv, dest_bed):
    """
    Fetch metadata for cSNPs or rSNPs from the UCSC database, clean it up
    and hand the result to the CSV and bed writers.

    The rsIDs read from `src` are looked up through a `GenomeBrowserClient`
    opened on 'local_hg19'. The fetched table then goes through the cleaning
    helpers below, in the order written, before being written out twice:
    indexed by "name" for the CSV writer and with a plain index for the bed
    writer.

    :param src: the rsID list
    :param dest_csv: the feature matrix
    :param dest_bed: the name of bed file to be generated
    :return: None
    """

    rsid_list = __read_id(src)

    with GenomeBrowserClient('local_hg19') as client:
        dfm = client.fetch_metadata(rsid_list)

        # Report on the raw query result before any row is filtered out
        __print_summary(dfm)

        # Row filters
        dfm = __remove_non_regular_chrom(dfm, verbose=True)
        dfm = __remove_non_single_class(dfm, verbose=True)

        # Allele handling; each step consumes the previous step's output
        dfm = __normalize_allele_strand(dfm)
        dfm = __build_allele_freq_map(dfm)
        dfm = __identify_major_minor_alleles(dfm, verbose=True)
        dfm = __revise_alleles_with_equal_freqs(dfm)

        # Column and coordinate clean-up, then chrY de-duplication
        dfm = __drop_redundant_col(dfm)
        dfm = __normalize_chrom_coord(dfm)
        dfm = CT.remove_dup_on_chrY(dfm)

        # The CSV writer receives the table keyed by "name" ...
        indexed_dfm = dfm.set_index("name")
        __to_csv(indexed_dfm, dest_csv)

        # ... while the bed writer receives "name" back as a regular column
        __to_bed(indexed_dfm.reset_index(), dest_bed)
Ejemplo n.º 2
0
    def get_feat(self, _input):
        """
        Fetch coordinates for the given rsIDs and drop chrY duplicates.

        :param _input: the rsIDs handed to `fetch_coord`
        :return: the result of `fetch_coord` after `remove_dup_on_chrY`
        """
        with GenomeBrowserClient(self.db_config_key) as client:
            return remove_dup_on_chrY(client.fetch_coord(_input))
Ejemplo n.º 3
0
    def get_feat(self, _input):
        """
        Fetch alleles for the given rsIDs, drop chrY duplicates and reshape
        the columns.

        :param _input: the rsIDs handed to `fetch_alleles`
        :return: the output of `AlleleUtil.transform_cols` applied to the
            de-duplicated query result
        """
        with GenomeBrowserClient(self.db_config_key) as client:
            alleles = remove_dup_on_chrY(client.fetch_alleles(_input))
            return AlleleUtil.transform_cols(alleles)
Ejemplo n.º 4
0
 def testRemoveDupOnChrY(self):
     """
     `remove_dup_on_chrY` must drop the chrY copy of an rsID that is also
     listed on another chromosome.
     """
     # "rs3" appears twice: once on chr3 and once on chrY
     records = {
         'name': ["rs1", "rs2", "rs3", "rs3"],
         'chrom': ["chr1", "chr2", "chr3", "chrY"],
         "tssDistance": [1, 2, 3, 4]
     }
     result = remove_dup_on_chrY(pandas.DataFrame(records))

     # Exactly one row is gone ...
     self.assertEqual(len(result), 3)
     # ... it is the chrY one ...
     self.assertTrue((result["chrom"] != "chrY").all())
     # ... and "rs3" is still present exactly once
     self.assertEqual(result["name"].tolist().count("rs3"), 1)
Ejemplo n.º 5
0
def __faulty_filter_on_allele(rsid, db_config_key):
    """
    Reproduce the earlier, known-faulty allele filter for the given rsIDs.

    Alleles are fetched through a `GenomeBrowserClient`, de-duplicated with
    `ct.remove_dup_on_chrY` and reshaped by `AlleleUtil.transform_cols`.
    Rows are then selected by the number of positive frequencies among the
    "A", "T", "C" and "G" columns. The selection rules are intentionally
    left as they were, flaws included; the inline comments spell out what
    is wrong with each of them.

    The row-wise `DataFrame.apply(..., reduce=True)` calls were replaced by
    equivalent vectorized masks: the `reduce` keyword no longer exists in
    current pandas, and the masks also behave well on empty frames.

    :param rsid: the rsIDs handed to `fetch_alleles`
    :param db_config_key: the configuration key handed to
        `GenomeBrowserClient`
    :return: the kept bi-, tri- and quad-allelic rows, concatenated in that
        order
    """
    maf_thld = 0.05
    allele_cols = list("ATCG")

    with GenomeBrowserClient(db_config_key) as gb_client:
        snp_allele = gb_client.fetch_alleles(rsid)
        snp_allele = ct.remove_dup_on_chrY(snp_allele)

    snp_allele = AlleleUtil.transform_cols(snp_allele)

    # Number of alleles with a positive frequency, per row
    n_allele = (snp_allele.loc[:, allele_cols] > 0).sum(axis=1)

    # all mono-allelic are excluded
    # mono = (n_allele == 1)

    # all bi-allelic are included
    # The problem of the previous filter: we didn't apply the "MAF >= 5%" rule to biallelic SNPs
    bi = (n_allele == 2)

    # include if min freq < 0.05 (then we can just discard this min freq);
    # This is faulty because we may include an entry with freq = (0.96, 0.02, 0.02)
    tri = (n_allele == 3)

    # include if min 2 freqs < 0.05 (then we can just discard these 2 min freqs);
    # This is faulty because we may include an entry with freq = (0.94, 0.02, 0.02, 0.02)
    quad = (n_allele == 4)

    bi_dfm = snp_allele.loc[bi, :]

    tri_dfm = snp_allele.loc[tri, :]
    tri_freqs = tri_dfm.loc[:, allele_cols]
    # Smallest positive frequency of each row; non-positive entries are
    # masked out so they cannot win the minimum
    min_one_under_thld = \
        tri_freqs.where(tri_freqs > 0).min(axis=1) < maf_thld
    tri_dfm = tri_dfm.loc[min_one_under_thld, :]

    quad_dfm = snp_allele.loc[quad, :]
    quad_freqs = quad_dfm.loc[:, allele_cols]
    # "The two smallest positive frequencies are both under the threshold"
    # is the same as "at least two positive frequencies are under it"
    min_two_under_thld = \
        ((quad_freqs > 0) & (quad_freqs < maf_thld)).sum(axis=1) >= 2
    quad_dfm = quad_dfm.loc[min_two_under_thld, :]

    # No need to query Biomart because it's known that what Biomart would return is empty

    # And PCE exclusion is also done after `get_df_1kb`

    return pd.concat([bi_dfm, tri_dfm, quad_dfm], axis=0)
Ejemplo n.º 6
0
    def maf_filter(snp_dfm,
                   maf_threshold=0.05,
                   use_biomart=False,
                   verbose=False):
        """
        Keep only the SNPs with exactly two alleles at or above the MAF
        threshold.

        A SNP is considered not valid if:
          CASE 1: it has only one allele;
          CASE 2: it has two alleles but maf < `maf_threshold`;
          CASE 3: it has three or four alleles but you cannot tell which is
              minor because there are at least 3 alleles with
              freq >= `maf_threshold`. (If there is one allele with
              freq < `maf_threshold`, we simply discard it.)
        SNPs of CASE 1 and CASE 3 are queried in Biomart for a second chance
        when `use_biomart` is set.

        The masks are computed with vectorized comparisons instead of
        row-wise `DataFrame.apply(..., reduce=True)`: the `reduce` keyword
        no longer exists in current pandas, and the vectorized form also
        handles empty frames.

        :param snp_dfm: data frame with a 'name' column and the frequency
            columns 'A', 'T', 'C', 'G'
        :param maf_threshold: minimum frequency for an allele to count
        :param use_biomart: whether to re-query invalid, non-biallelic SNPs
            in Biomart
        :param verbose: forwarded to `BiomartClient2.query_snp`
        :return: the valid rows of `snp_dfm`, followed by the valid rows
            returned by Biomart when `use_biomart` is set
        """
        allele_cols = list('ATCG')

        def _count_valid_freqs(dfm):
            # Per row: number of alleles whose frequency is positive and
            # reaches the threshold
            freqs = dfm.loc[:, allele_cols]
            return ((freqs >= maf_threshold) & (freqs > 0)).sum(axis=1)

        def _count_alleles(dfm):
            # Per row: number of alleles with a positive frequency
            return (dfm.loc[:, allele_cols] > 0).sum(axis=1)

        snp_valid = _count_valid_freqs(snp_dfm) == 2

        if use_biomart:
            snp_biallelic = _count_alleles(snp_dfm) == 2

            with BiomartClient2() as bm_client:
                # Invalid and not biallelic, i.e. CASE 1 and CASE 3 only;
                # CASE 2 rows are dropped without a second chance
                bm_df = bm_client.query_snp(
                    rsid_list=snp_dfm.loc[~snp_valid & ~snp_biallelic,
                                          'name'].tolist(),
                    verbose=verbose)
                bm_df = remove_dup_on_chrY(bm_df)

                bm_valid = _count_valid_freqs(bm_df) == 2

                df = pd.concat(
                    [snp_dfm.loc[snp_valid, :], bm_df.loc[bm_valid, :]],
                    axis=0)
        else:
            df = snp_dfm.loc[snp_valid, :]

        return df