Beispiel #1
0
def get_rsid_list():
    """Return the values of the fourth column of ``osu19_SNP.bed`` as a list.

    The file is looked up in the directory returned by
    ``get_path("vertex/SNP")`` and parsed as a headerless, tab-separated
    table.  Only the column at zero-based index 3 is read; it is labelled
    ``name`` while loading.

    Returns:
        list: one entry per row of the file, in file order.
    """
    snp_dir = get_path("vertex/SNP")
    snp_fn = "osu19_SNP.bed"
    # `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in
    # pandas 2.0, so select the single loaded column explicitly instead.
    snp_df = pd.read_csv(os.path.join(snp_dir, snp_fn),
                         sep="\t",
                         header=None,
                         usecols=[3],
                         names=["name"])

    return snp_df["name"].tolist()
Beispiel #2
0
def read_coexpedia_dfms(_dir):
    """Yield one DataFrame per entry of ``_dir``, in sorted file-name order.

    Every entry is parsed as a headerless, tab-separated table with the
    columns ``Gene_A``, ``Gene_B`` and ``LLS``.  A ``GEO_ACCESSION`` column
    is appended whose value is the part of the file name that precedes the
    first ``".txt"``.

    Args:
        _dir: path of the directory holding the tables.

    Yields:
        pandas.DataFrame: the table of one file, with ``GEO_ACCESSION`` added.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the
    # sequence of yielded frames (and anything concatenated from it)
    # reproducible across runs and file systems.
    for fn in sorted(os.listdir(_dir)):
        path = os.path.join(_dir, fn)
        dfm = pd.read_csv(path,
                          sep="\t",
                          header=None,
                          names=["Gene_A", "Gene_B", "LLS"])

        geo_accession_id = fn.split(".txt")[0]
        dfm = dfm.assign(GEO_ACCESSION=geo_accession_id)

        yield dfm


if __name__ == "__main__":
    # Concatenate every table found under <resource/Coexpedia>/Hsa and
    # store the combined table next to the inputs.
    res_dir = get_path("resource/Coexpedia")
    hsa_dir = os.path.join(res_dir, "Hsa")
    merged_dfm = pd.concat(read_coexpedia_dfms(hsa_dir), axis=0)

    merged_path = os.path.join(res_dir, "p1_coexpedia_merged.tsv")
    merged_dfm.to_csv(merged_path, sep="\t", index=False)

    # Keep only the two gene columns and pass them through the
    # undirected-edge de-duplication helper before writing the edge list.
    gene_pairs = merged_dfm[["Gene_A", "Gene_B"]]
    gene_pairs = remove_duplicated_undirected_edges(gene_pairs, sort=True)

    edgelist_path = os.path.join(get_path("edge/gene-gene"),
                                 "Coexpedia.edgelist")
    gene_pairs.to_csv(edgelist_path, sep="\t", index=False, header=False)
Beispiel #3
0
    return df_a_minus_b

def output_snp_gene_edgelist(snp_gene_map):
    """Build a de-duplicated SNP/gene table sorted by ``gene_id``.

    The distinct (``rs_id``, ``gene_ensembl_id``) rows of ``snp_gene_map``
    are passed to ``map_Ensembl_IDs_to_Entrez`` with
    ``new_colname="gene_id"`` and ``keep_unmapped=True``; the result is
    returned ordered by its ``gene_id`` column.
    """
    distinct_pairs = snp_gene_map[["rs_id", "gene_ensembl_id"]].drop_duplicates()

    mapped_pairs = map_Ensembl_IDs_to_Entrez(
        distinct_pairs,
        ensembl_colname="gene_ensembl_id",
        new_colname="gene_id",
        keep_unmapped=True,
    )

    return mapped_pairs.sort_values(by="gene_id")

def output_snp_snp_edgelist(snp_snp_map):
    """Return the distinct (``rs_id_A``, ``rs_id_B``) rows of ``snp_snp_map``.

    All other columns are dropped; the first occurrence of each pair is kept
    together with its original row label.
    """
    endpoint_cols = ["rs_id_A", "rs_id_B"]
    unique_pairs = snp_snp_map[endpoint_cols].drop_duplicates()
    return unique_pairs


if __name__ == "__main__":
    # Script entry point: load the per-interactor intersection BED files from
    # the 4DGenome resource directory and join them into (SNP, gene) tables.
    _4DGenome_dir = get_path("resource/4DGenome")

    # SNP intersections, one table per interactor side (A / B).
    snp_in_interactorA = get_interacted_SNPs(os.path.join(_4DGenome_dir, "InteractorA_SNP_intxn.bed"))
    snp_in_interactorB = get_interacted_SNPs(os.path.join(_4DGenome_dir, "InteractorB_SNP_intxn.bed"))

    # Ensembl TSS intersections, one table per interactor side.
    gene_tss_in_interactorA = get_genes_with_interacted_TSSs(os.path.join(_4DGenome_dir, "InteractorA_Ensembl_TSS_intxn.bed"))
    gene_tss_in_interactorB = get_genes_with_interacted_TSSs(os.path.join(_4DGenome_dir, "InteractorB_Ensembl_TSS_intxn.bed"))

    # Ensembl promoter intersections, one table per interactor side.
    gene_prm_in_interactorA = get_genes_with_interacted_promoters(os.path.join(_4DGenome_dir, "InteractorA_Ensembl_promoter_intxn.bed"))
    gene_prm_in_interactorB = get_genes_with_interacted_promoters(os.path.join(_4DGenome_dir, "InteractorB_Ensembl_promoter_intxn.bed"))

    # interacted (SNP, gene) pairs (via TSSs/promoters)
    # NOTE(review): `_cross_inner_join` is defined outside this view; presumably
    # it pairs SNPs on one interactor side with genes on the other -- confirm.
    snp_tss_map = _cross_inner_join(snp_in_interactorA, snp_in_interactorB, gene_tss_in_interactorA, gene_tss_in_interactorB)
    snp_prm_map = _cross_inner_join(snp_in_interactorA, snp_in_interactorB, gene_prm_in_interactorA, gene_prm_in_interactorB)

    # remove redundant (SNP, gene) pairs that are both associated by TSSs and promoters
Beispiel #4
0
    eqtl_fn_list = list_files(eqtl_dir, eqtl_suffix)

    for eqtl_fn in eqtl_fn_list:
        eqtl_path = os.path.join(eqtl_dir, eqtl_fn)
        eqtl_df = read_eqtl_df(eqtl_path)

        snp_egene_map = snp_df.merge(eqtl_df, on="rs_id", how="inner")

        eqtl_source = eqtl_fn.split(eqtl_suffix)[0]
        snp_egene_map = snp_egene_map.assign(eqtl_source=eqtl_source)

        yield snp_egene_map


if __name__ == "__main__":
    # Script entry point: join the SNP table with every GTEx v7 eGene file
    # and normalise the Ensembl gene IDs of the combined result.
    snp_dir = get_path("vertex/SNP")
    snp_fn = "osu19_SNP.bed"
    snp_df = read_snp_df(os.path.join(snp_dir, snp_fn))

    res_dir = get_path("resource/GTEx")
    eqtl_dir = os.path.join(res_dir, "GTEx_Analysis_v7_eQTL")
    # File-name suffix passed to gen_snp_egene_map together with eqtl_dir.
    eqtl_suffix = ".v7.egenes.txt"

    # Stack the per-file tables into one frame with a fresh 0..n-1 index.
    snp_egene_map = pd.concat(gen_snp_egene_map(snp_df, eqtl_dir, eqtl_suffix),
                              ignore_index=True)
    # Get rid of version numbers: keep only the text before the first "."
    # of every gene_ensembl_id value.
    snp_egene_map.loc[:,
                      "gene_ensembl_id"] = snp_egene_map.loc[:,
                                                             "gene_ensembl_id"].apply(
                                                                 lambda x: x.
                                                                 split(".")[0])
Beispiel #5
0
import os
import pandas as pd
from util_path import get_path

# Load the Ensembl/Entrez gene ID table once, at import time; both ID
# columns are read as strings.
_gene_dir = get_path("vertex/gene")
_ensembl_entrez_map = pd.read_csv(
    os.path.join(_gene_dir, "Ensembl_x_Entrez.tsv"),
    sep="\t",
    dtype={"Ensembl_Gene_ID": str, "Entrez_Gene_ID": str},
).rename(
    # Prefix the column names with an underscore to lower the possibility
    # of naming conflicts with the frames this table gets merged into.
    columns={
        "Ensembl_Gene_ID": "_Ensembl_Gene_ID",
        "Entrez_Gene_ID": "_Entrez_Gene_ID",
    })


def get_mapped_Ensembl_IDs():
    """Return the distinct ``_Ensembl_Gene_ID`` values of the module-level mapping table as a set."""
    ensembl_ids = _ensembl_entrez_map["_Ensembl_Gene_ID"]
    return set(ensembl_ids)


def map_Ensembl_IDs_to_Entrez(df,
                              ensembl_colname,
                              new_colname,
                              keep_unmapped=False):
    """
    Please make sure that no column in `df` is named "_Ensembl_Gene_ID" or "_Entrez_Gene_ID"
    """
Beispiel #6
0
def get_rsid_list():
    """Return the values of the fourth column of ``osu19_SNP.bed`` as a list.

    The file is looked up in the directory returned by
    ``get_path("vertex/SNP")`` and parsed as a headerless, tab-separated
    table.  Only the column at zero-based index 3 is read; it is labelled
    ``name`` while loading.

    Returns:
        list: one entry per row of the file, in file order.
    """
    snp_dir = get_path("vertex/SNP")
    snp_fn = "osu19_SNP.bed"
    # `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in
    # pandas 2.0, so select the single loaded column explicitly instead.
    snp_df = pd.read_csv(os.path.join(snp_dir, snp_fn),
                         sep="\t",
                         header=None,
                         usecols=[3],
                         names=["name"])

    return snp_df["name"].tolist()


if __name__ == "__main__":
    # Both outputs are written into the EncodeTFBS resource directory.
    res_dir = get_path("resource/EncodeTFBS")
    matrix_path = os.path.join(res_dir, "p1_SNP_x_TFBS_matrix.tsv")
    map_path = os.path.join(res_dir, "p2_SNP_x_TFBS_map.tsv")

    # Step 1: build the SNP x TFBS matrix, update its symbols, and save it
    # with both header and index.
    tfbs_matrix = update_tfbs_symbol(
        get_snp_tfbs_matrix(get_rsid_list(), config_key="local_hg19"))
    tfbs_matrix.to_csv(matrix_path, sep="\t", header=True, index=True)

    # Step 2: convert the matrix to a two-column (name, symbol) table and
    # save it without the index.
    tfbs_map = convert_to_map(tfbs_matrix, map_colnames=["name", "symbol"])
    tfbs_map.to_csv(map_path, sep="\t", header=True, index=False)
Beispiel #7
0
import os
import pandas as pd
from util_path import get_path
from util_edge import remove_duplicated_undirected_edges

if __name__ == "__main__":
    # Read only the two Entrez interactor columns of the BioGRID table.
    interactor_cols = ["Entrez Gene Interactor A", "Entrez Gene Interactor B"]
    source_path = os.path.join(get_path("resource/BioGRID"),
                               "BIOGRID-Human.tab2.tsv")
    edges = pd.read_csv(source_path, sep="\t", usecols=interactor_cols)

    edges = remove_duplicated_undirected_edges(edges, sort=True)

    # Write the pairs as a headerless, index-free, tab-separated edge list.
    target_path = os.path.join(get_path("edge/gene-gene"), "BioGRID.edgelist")
    edges.to_csv(target_path, sep="\t", index=False, header=False)
Beispiel #8
0
import os
import pandas as pd
from util_path import get_path

# Load the discontinued Entrez gene IDs once, at import time, as a set of
# integers for fast membership tests.
_dei_dir = get_path("resource/Util_DEI")
_dei_set = set(
    pd.read_csv(
        os.path.join(_dei_dir, "discontinued_entrez_id.tsv"),
        sep="\t",
        usecols=["Discontinued_GeneID"],
        dtype={"Discontinued_GeneID": int},
    )["Discontinued_GeneID"]
)


def filter_dei(df, id_col):
    """Drop the rows of ``df`` whose ``id_col`` value is in the module-level ``_dei_set``.

    Any IDs found in both ``df[id_col]`` and ``_dei_set`` are reported on
    stdout before the rows carrying them are removed.

    Args:
        df: table to filter.
        id_col: label of the column holding the IDs to check.

    Returns:
        The rows of ``df`` whose ``id_col`` value was not found in ``_dei_set``.
    """
    ids = df[id_col]
    discontinued_hits = set(ids) & _dei_set

    if discontinued_hits:
        print("[util_dei] Found Disctinued Entrez ID: {}".format(discontinued_hits))

    keep_mask = ~ids.isin(discontinued_hits)
    return df.loc[keep_mask, :]
Beispiel #9
0
import os
import pandas as pd
from util_path import get_path
from util_edge import remove_duplicated_undirected_edges

if __name__ == "__main__":
    # The file is parsed with three column names (lines starting with "#"
    # are treated as comments); only the two gene columns are loaded.
    column_names = ["Gene_A", "Gene_B", "LLS"]
    source_path = os.path.join(get_path("resource/HumanNet"),
                               "HumanNet-XN.tsv")
    edges = pd.read_csv(source_path,
                        sep="\t",
                        comment="#",
                        names=column_names,
                        usecols=column_names[:2])

    edges = remove_duplicated_undirected_edges(edges, sort=True)

    # Write the pairs as a headerless, index-free, tab-separated edge list.
    target_path = os.path.join(get_path("edge/gene-gene"), "HumanNet.edgelist")
    edges.to_csv(target_path, sep="\t", index=False, header=False)
Beispiel #10
0

def convert_to_df(closest_obj):
    """Parse the string form of ``closest_obj`` into an 11-column DataFrame.

    ``str(closest_obj)`` is read as headerless, tab-separated text.  The
    columns are labelled, in order: the four ``snp*`` fields, the six
    ``gene*`` fields, and ``distance``.

    Args:
        closest_obj: any object whose ``str()`` yields the tab-separated
            text (presumably a bedtools "closest" result -- confirm with
            the caller).

    Returns:
        pandas.DataFrame: one row per line of the text.
    """
    snp_fields = ["snpChrom", "snpChromStart", "snpChromEnd", "snpName"]
    gene_fields = [
        "geneChrom", "geneChromStart", "geneChromEnd", "geneName", "geneID",
        "geneStrand"
    ]
    column_names = snp_fields + gene_fields + ["distance"]

    text_buffer = StringIO(str(closest_obj))
    return pd.read_csv(text_buffer,
                       sep="\t",
                       header=None,
                       names=column_names)


if __name__ == "__main__":
    # Script entry point: prepare the SNP and TSS inputs (Ensembl and Entrez)
    # and wrap each of them with convert_to_bedtool_obj.
    snp_dir = get_path("vertex/SNP")
    snp_fn = "osu19_SNP.bed"
    snp_start_bed = make_snp_start_BED(os.path.join(snp_dir, snp_fn))

    # The two TSS files live in the same gene vertex directory and go
    # through the same make_TSS_start_BED helper.
    gene_dir = get_path("vertex/gene")
    ensembl_TSS_fn = "Ensembl_TSS.bed"
    ensembl_TSS_start_bed = make_TSS_start_BED(
        os.path.join(gene_dir, ensembl_TSS_fn))
    entrez_TSS_fn = "Entrez_TSS.bed"
    entrez_TSS_start_bed = make_TSS_start_BED(
        os.path.join(gene_dir, entrez_TSS_fn))

    # NOTE(review): convert_to_bedtool_obj is defined outside this view;
    # presumably it returns pybedtools BedTool objects -- confirm.
    snp_start_bt = convert_to_bedtool_obj(snp_start_bed)
    ensembl_TSS_start_bt = convert_to_bedtool_obj(ensembl_TSS_start_bed)
    entrez_TSS_start_bt = convert_to_bedtool_obj(entrez_TSS_start_bed)
Beispiel #11
0
import os
import pandas as pd
import mygene
from util_path import get_path
from util_dei import filter_dei

# Module-level directories resolved once at import time; the reader
# functions below use `res_dir` to locate their input files.
res_dir = get_path("resource/Entrez")
gene_dir = get_path("vertex/gene")

# Shared mygene client instance for this module.
mg = mygene.MyGeneInfo()

def read_gene2ensembl():
    """Load the distinct Entrez/Ensembl gene ID pairs from ``gene2ensembl_9606.tsv``.

    The file is read from the module-level ``res_dir`` as a headerless,
    tab-separated table with the columns ``Tax_ID``, ``Entrez_Gene_ID`` and
    ``Ensembl_Gene_ID``.  After checking that 9606 is the only ``Tax_ID``
    present, that column is dropped and duplicate rows are removed.

    Returns:
        pandas.DataFrame: columns ``Entrez_Gene_ID`` and ``Ensembl_Gene_ID``;
        rows keep the labels they had in the file (they are not renumbered).

    Raises:
        ValueError: if the file does not contain exactly one distinct
            ``Tax_ID`` or that value is not 9606.
    """
    g2e_df = pd.read_csv(os.path.join(res_dir, "gene2ensembl_9606.tsv"), sep="\t", header=None,
                         names=["Tax_ID", "Entrez_Gene_ID", "Ensembl_Gene_ID"])

    # Validate with an explicit raise: `assert` statements are stripped when
    # Python runs with -O, which would silently skip this data check.
    unique_tax_ids = g2e_df.Tax_ID.unique()
    if len(unique_tax_ids) != 1 or unique_tax_ids[0] != 9606:
        raise ValueError(
            "gene2ensembl_9606.tsv must contain only Tax_ID 9606, found: {}".format(
                list(unique_tax_ids)))

    # The former trailing `.reindex()` had no arguments and therefore left
    # the frame unchanged; it is omitted here.
    return g2e_df.drop("Tax_ID", axis=1).drop_duplicates()

def read_biomart(GRCh):
    global res_dir

    if GRCh == "37":
        filename = "GRCh37_p13_mart_export.txt"
    elif GRCh == "38":