Example #1
0
    def get_feat(self, _input):
        """
        Fetch the coordinates of the given rsIDs.

        :param _input: the rsIDs, handed as-is to `GenomeBrowserClient.fetch_coord`
        :return: the fetched coordinates after being passed through
            `remove_dup_on_chrY`
        """
        with GenomeBrowserClient(self.db_config_key) as client:
            return remove_dup_on_chrY(client.fetch_coord(_input))
Example #2
0
def extract_metadata(src, dest_csv, dest_bed):
    """
    Query the `local_hg19` genome browser database for SNP metadata, clean
    it, and write the result out twice (via `__to_csv` and `__to_bed`).

    :param src: source of the rsID list, read with `__read_id`
    :param dest_csv: destination handed to `__to_csv` (frame indexed by "name")
    :param dest_bed: destination handed to `__to_bed` ("name" as a regular column)
    :return: None
    """
    rsid_list = __read_id(src)

    with GenomeBrowserClient('local_hg19') as client:
        snp_dfm = client.fetch_metadata(rsid_list)

        __print_summary(snp_dfm)

        # Cleaning steps, applied strictly in this order. Each one receives
        # the current frame (plus the listed keyword arguments) and returns
        # the frame for the next step.
        cleaning_steps = (
            (__remove_non_regular_chrom, {"verbose": True}),
            (__remove_non_single_class, {"verbose": True}),
            (__normalize_allele_strand, {}),
            (__build_allele_freq_map, {}),
            (__identify_major_minor_alleles, {"verbose": True}),
            (__revise_alleles_with_equal_freqs, {}),
            (__drop_redundant_col, {}),
            (__normalize_chrom_coord, {}),
            (CT.remove_dup_on_chrY, {}),
        )
        for step, step_kwargs in cleaning_steps:
            snp_dfm = step(snp_dfm, **step_kwargs)

        indexed_by_name = snp_dfm.set_index("name")
        __to_csv(indexed_by_name, dest_csv)

        # The BED writer gets the frame *after* the set_index/reset_index
        # round trip, i.e. with "name" moved to the first column.
        __to_bed(indexed_by_name.reset_index(), dest_bed)
Example #3
0
    def get_feat(self, _input):
        """
        Build transcription-factor (TF) features for the given SNPs.

        :param _input: the SNP data frame; must provide the columns
            'chrom', 'chromStart', 'chromEnd' and 'name'
        :return: a data frame keyed by 'name' holding 'tfCount' (missing
            values replaced by 0) and the columns produced by
            `self.binary_encode` from the 'tfName' column
        """
        snp_dfm = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']]

        with GenomeBrowserClient(self.db_config_key) as client:
            rsids = snp_dfm.loc[:, 'name'].tolist()
            tf_dfm = client.fetch_tf(rsids)

        snp_dfm = snp_dfm.merge(tf_dfm,
                                how='left',
                                on=['name', 'chrom'],
                                copy=True)

        # SNPs without any TF record come out of the left join with NaN.
        snp_dfm.loc[:, 'tfCount'] = snp_dfm.loc[:, 'tfCount'].fillna(0)

        # osu17 used no prefix for TF column names; a "tf_" prefix is
        # preferred for osu18.
        tf_col_prefix = None if self.reproduce_osu17 else "tf_"
        snp_dfm = self.binary_encode(snp_dfm,
                                     cat_col_name="tfName",
                                     cat_col_sep=',',
                                     bin_col_prefix=tf_col_prefix)

        if self.reproduce_osu17:
            # The osu17 feature matrix contains an all-zero TF column that
            # `gb_client` won't return in its result, so it is added by hand.
            # NOTE(review): the original note called this TF `POLR3R`, while
            # the code adds `POLR3G` -- confirm which name is intended.
            snp_dfm = snp_dfm.assign(POLR3G=0)

        return snp_dfm.drop(['chrom', 'chromStart', 'chromEnd'], axis=1)
Example #4
0
    def get_feat(self, _input):
        """
        Attach genome-segmentation columns to the given SNPs.

        :param _input: the SNP data frame; must provide 'chrom' and 'name'
        :return: one row per input row with 'name' plus the segmentation
            columns returned by `identify_genome_seg`; missing values are
            replaced by 0 and 'chrom' is dropped
        """
        snp_dfm = _input.loc[:, ['chrom', 'name']]

        with GenomeBrowserClient(self.db_config_key) as client:
            seg_dfm = client.identify_genome_seg(snp_dfm.loc[:, 'name'])

            # NOTE: `ct.remove_dup_on_chrY` is not applied to the result
            # here; that call was left commented out in the original code.

            if not self.reproduce_osu17:
                # Use clearer names for osu18: 'ch<i>Name' -> 'Chromhmm<Cell>'
                # and 'sw<i>Name' -> 'Segway<Cell>', numbered in this order.
                cell_lines = ('Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec',
                              'K562')
                clearer_names = {}
                for idx, cell in enumerate(cell_lines, start=1):
                    clearer_names['ch{}Name'.format(idx)] = 'Chromhmm' + cell
                    clearer_names['sw{}Name'.format(idx)] = 'Segway' + cell

                seg_dfm = seg_dfm.rename(columns=clearer_names)

            merged = snp_dfm.merge(seg_dfm, how='left', on=['name', 'chrom'])
            merged = merged.drop(['chrom'], axis=1)

            return merged.fillna(0)
Example #5
0
    def __get_candidate_dfm(self, rsid):
        """
        Compute TSS distances for the given rsIDs, retrying the ones that
        got no result in the first pass.

        The first pass calls `compute_tss_dist` with `adjacent_bins=1`. Any
        rsID missing from its result is retried with `adjacent_bins=-1`
        (presumably a wider search -- confirm against
        `GenomeBrowserClient.compute_tss_dist`). rsIDs still missing after
        the second pass are only reported on stdout.

        :param rsid: the rsIDs; must behave like a pandas Series (`isin`,
            boolean indexing, `empty`, `shape` and `tolist` are used)
        :return: the first-pass result if it covers every rsID, otherwise
            both results concatenated with a fresh 0..n-1 index
        """
        with GenomeBrowserClient(self.db_config_key) as gb_client:
            first_run_result = gb_client.compute_tss_dist(rsid,
                                                          adjacent_bins=1)

            remainder = rsid[~rsid.isin(first_run_result.loc[:, "name"])]
            if remainder.empty:
                return first_run_result

            print(
                "[TssDistUtil]: No distance found for {n} SNP(s) after 1st run: \r\n{snps}"
                .format(n=remainder.shape[0], snps=remainder.tolist()))

            second_run_result = gb_client.compute_tss_dist(remainder,
                                                           adjacent_bins=-1)

            remainder = remainder[~remainder.isin(
                second_run_result.loc[:, "name"])]

            if not remainder.empty:
                print(
                    "[TssDistUtil]: No distance found for {n} SNP(s) after 2nd run: \r\n{snps}"
                    .format(n=remainder.shape[0], snps=remainder.tolist()))

            # IMPORTANT: do apply `reset_index()` or set `ignore_index = True` here
            # if not, multiple entries would share a same index
            #   which would be a disaster for `idxmin()` operation in `__minTssDist`
            return pd.concat([first_run_result, second_run_result],
                             ignore_index=True)
Example #6
0
    def get_feat(self, _input):
        """
        Fetch the alleles of the given rsIDs and reshape them.

        :param _input: the rsIDs, handed as-is to `GenomeBrowserClient.fetch_alleles`
        :return: the fetched alleles after `remove_dup_on_chrY` and
            `AlleleUtil.transform_cols` have been applied, in that order
        """
        with GenomeBrowserClient(self.db_config_key) as client:
            alleles = remove_dup_on_chrY(client.fetch_alleles(_input))

            return AlleleUtil.transform_cols(alleles)
Example #7
0
    def pce_filter(snp_dfm, verbose=False):
        """
        Drop the SNPs that `in_protein_coding_exon` reports as lying in a
        protein-coding exon (queried against the `local_hg19` database).

        :param snp_dfm: the SNP data frame; must provide 'chrom' and 'chromStart'
        :param verbose: if True, print the number of excluded SNPs and the
            excluded rows
        :return: the rows of `snp_dfm` that were not flagged
        """
        with GenomeBrowserClient('local_hg19') as client:

            def _is_in_pce(row):
                # One lookup per SNP, keyed by chromosome and start position.
                return client.in_protein_coding_exon(row['chrom'],
                                                     row['chromStart'])

            pce_mask = snp_dfm.apply(_is_in_pce, axis=1)

        if verbose:
            report = "[pce_filter] found {n} SNP in protein-coding exons: \r\n {dfm}"
            print(report.format(n=sum(pce_mask), dfm=snp_dfm[pce_mask]))

        return snp_dfm.loc[~pce_mask]
    def get_feat(self, _input):
        """
        Count the VISTA enhancers selected for each SNP and total their scores.

        :param _input: the SNP data frame; must provide 'chrom',
            'chromStart', 'chromEnd' and 'name'
        :return: a data frame with the columns 'name', 'vistaEnhancerCnt'
            and 'vistaEnhancerTotalScore'
        """
        snp_dfm = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']]

        with GenomeBrowserClient(self.db_config_key) as client:
            enhancers_per_snp = snp_dfm.apply(
                lambda row: client.select_vista_enhancer(row['chrom'],
                                                         row['chromStart']),
                axis=1)

            counts = []
            total_scores = []
            for enhancers in enhancers_per_snp:
                counts.append(len(enhancers))
                # Each entry is unpacked as a pair whose second item is the score.
                total_scores.append(sum(score for _, score in enhancers))

            return pd.DataFrame(data=dict(name=snp_dfm['name'],
                                          vistaEnhancerCnt=counts,
                                          vistaEnhancerTotalScore=total_scores))
Example #9
0
    def __yield_feat(self, snp_bed_fn):
        """
        Yield one TSS-distance feature column per file in `self.src_data_fn`.

        For every source file, the SNPs are intersected with it (BedTool
        `intersect` with `wb=True`), `select_tss_dist` is queried for each
        intersecting region, the results are summed per SNP, and
        `min_tss_dist` is applied to that per-SNP sum.

        :param snp_bed_fn: path of the SNP BED file; read as 4 columns
            ('chrom', 'chromStart', 'chromEnd', 'name') without a header
        :return: a generator of pandas Series, one per source file, indexed
            by SNP 'name' and named '<file name without ".bed">TssDist'.
            SNPs without any intersection get NaN from the left merge.
        """
        snp_bed_obj = BedTool(snp_bed_fn)
        snp_bed_dfm = pd.read_table(
            snp_bed_fn,
            header=None,
            names=['chrom', 'chromStart', 'chromEnd', 'name'])

        for fn in self.src_data_fn:
            complement_filename = os.path.join(self.src_data_dir, fn)

            comp_bed_obj = BedTool(complement_filename)
            intx = snp_bed_obj.intersect(comp_bed_obj, wb=True)

            intx_dfm = pd.read_table(StringIO(str(intx)),
                                     header=None,
                                     names=[
                                         'snpChrom', 'snpChromStart',
                                         'snpChromEnd', 'snpName',
                                         'blockChrom', 'blockChromStart',
                                         'blockChromEnd', 'compChrom',
                                         'compChromStart', 'compChromEnd'
                                     ])
            snp_comp = intx_dfm[[
                'snpName', 'compChrom', 'compChromStart', 'compChromEnd'
            ]]

            with GenomeBrowserClient(self.db_config_key) as gb_client:
                # `result_type='reduce'` replaces the former `reduce=True`,
                # a keyword pandas deprecated in 0.23 and removed in 1.0.
                # It keeps the result a Series (one value per row) even for
                # list-like return values or an empty frame.
                tss_dist = snp_comp.apply(
                    lambda row: gb_client.select_tss_dist(
                        row['compChrom'], row['compChromStart'], row[
                            'compChromEnd']),
                    axis=1,
                    result_type='reduce')
                tss_dist = pd.DataFrame(tss_dist, columns=['tssDist'])
                tss_dist = pd.concat([snp_comp['snpName'], tss_dist], axis=1)
                tss_dist = tss_dist.groupby('snpName').agg(sum).reset_index()

                # fn[:-4] removes the ".bed" extension
                col_name = fn[:-4] + 'TssDist' if fn.endswith(
                    ".bed") else fn + 'TssDist'

                tss_dist.loc[:, col_name] = tss_dist.loc[:, 'tssDist'].apply(
                    min_tss_dist)

                result_dfm = pd.merge(snp_bed_dfm, tss_dist, how='left', left_on='name', right_on='snpName'). \
                    set_index('name')

                yield result_dfm[col_name]
Example #10
0
    def get_feat(self, _input):
        """
        Look up the phastCons value of each SNP.

        :param _input: the SNP data frame; must provide 'name' and 'chrom'
        :return: a data frame with the columns 'name' and 'phastCons', where
            missing values are replaced by 0
        """
        with GenomeBrowserClient(self.db_config_key) as client:
            rsids = _input.loc[:, 'name'].tolist()
            phastcons_dfm = client.fetch_phastcons(rsids)

        # Left join keeps every input SNP, including those without a value.
        merged = _input.merge(phastcons_dfm,
                              how='left',
                              on=['name', 'chrom'],
                              copy=True)

        feat_dfm = merged.loc[:, ["name", "phastCons"]]
        return feat_dfm.fillna(0)
Example #11
0
    def get_feat(self, _input):
        """
        Flag each SNP with the result of `in_nki_lad` for its position.

        :param _input: the SNP data frame in BED format; must provide
            'chrom', 'chromStart' and 'name'
        :return: a data frame with the columns 'name' and 'NkiLad', in that order
        """
        with GenomeBrowserClient(self.db_config_key) as client:

            def _in_lad(row):
                return client.in_nki_lad(row['chrom'], row['chromStart'])

            lad_flags = _input.apply(_in_lad, axis=1)

            flagged = pd.DataFrame(
                data=dict(name=_input['name'], NkiLad=lad_flags))

            # Select explicitly so 'name' always comes first.
            return flagged[['name', 'NkiLad']]
Example #12
0
def __faulty_filter_on_allele(rsid, db_config_key):
    """
    Reproduce the previous (known-faulty) allele-based SNP filter.

    The flaws described in the comments below are kept on purpose; this
    function is not meant to be a correct MAF filter.

    Selection rules, based on how many of the columns A/T/C/G are > 0:
      * 1 (mono-allelic): excluded
      * 2 (bi-allelic): always included
      * 3 (tri-allelic): included if the smallest positive value is < 0.05
      * 4 (quad-allelic): included if the two smallest values are < 0.05

    :param rsid: the rsIDs, handed as-is to `GenomeBrowserClient.fetch_alleles`
    :param db_config_key: the config key used to open `GenomeBrowserClient`
    :return: the selected rows of the transformed allele data frame,
        bi-allelic first, then tri-allelic, then quad-allelic
    """
    maf_thld = 0.05

    with GenomeBrowserClient(db_config_key) as gb_client:
        snp_allele = gb_client.fetch_alleles(rsid)
        snp_allele = ct.remove_dup_on_chrY(snp_allele)

    snp_allele = AlleleUtil.transform_cols(snp_allele)

    # Number of alleles per SNP with a value above zero
    n_allele = snp_allele.loc[:, list("ATCG")].apply(lambda x: sum(x > 0),
                                                     axis=1)

    # all mono-allelic are excluded
    # mono = (n_allele == 1)

    # all bi-allelic are included
    # The problem of the previous filter: we didn't apply the "MAF >= 5%" rule to biallelic SNPs
    bi = (n_allele == 2)

    # include if min freq < 0.05 (then we can just discard this min freq);
    # This is faulty because we may include an entry with freq = (0.96, 0.02, 0.02)
    tri = (n_allele == 3)

    # include if min 2 freqs < 0.05 (then we can just discard these 2 min freqs);
    # This is faulty because we may include an entry with freq = (0.94, 0.02, 0.02, 0.02)
    quad = (n_allele == 4)

    bi_dfm = snp_allele.loc[bi, :]

    # `result_type='reduce'` replaces the former `reduce=True`, a keyword
    # pandas deprecated in 0.23 and removed in 1.0. It guarantees a Series
    # mask even when the selected frame is empty.
    tri_dfm = snp_allele.loc[tri, :]
    min_one_under_thld = tri_dfm.loc[:, list("ATCG")].apply(
        lambda x: min(x[x > 0]) < maf_thld, axis=1, result_type='reduce')
    tri_dfm = tri_dfm.loc[min_one_under_thld, :]

    quad_dfm = snp_allele.loc[quad, :]
    min_two_under_thld = quad_dfm.loc[:, list("ATCG")].apply(
        lambda x: (x[x > 0].sort_values().iloc[[0, 1]] < maf_thld).all(),
        axis=1,
        result_type='reduce')
    quad_dfm = quad_dfm.loc[min_two_under_thld, :]

    # No need to query Biomart because it's known that what Biomart would return is empty

    # And PCE exclusion is also done after `get_df_1kb`

    return pd.concat([bi_dfm, tri_dfm, quad_dfm], axis=0)
Example #13
0
 def test_in_protein_coding_exon(self):
     """chr7:117306990 must be reported as inside a protein-coding exon."""
     with GenomeBrowserClient('local_hg19') as client:
         self.assertTrue(
             client.in_protein_coding_exon(chrom='chr7',
                                           chrom_start=117306990))