def get_rsid_list(snp_path=None):
    """Return the values of the fourth column of a SNP BED file as a list.

    Parameters
    ----------
    snp_path : str, optional
        Path of a tab-separated file without a header row. When omitted,
        ``osu19_SNP.bed`` inside the directory returned by
        ``get_path("vertex/SNP")`` is read, which is the original behavior.

    Returns
    -------
    list
        The fourth-column values (labelled ``name``), in file order.
    """
    if snp_path is None:
        snp_path = os.path.join(get_path("vertex/SNP"), "osu19_SNP.bed")

    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    # pandas 2.0; selecting the single parsed column works on every version.
    snp_names = pd.read_csv(snp_path,
                            sep="\t",
                            header=None,
                            usecols=[3],
                            names=["name"])
    return snp_names["name"].tolist()
def read_coexpedia_dfms(_dir):
    """Yield one data frame per file found in `_dir`.

    Every entry of `_dir` is read as a tab-separated file without a header
    row; its three columns are named ``Gene_A``, ``Gene_B`` and ``LLS``. A
    fourth column, ``GEO_ACCESSION``, holds the part of the file name that
    precedes the first ``".txt"``.

    Parameters
    ----------
    _dir : str
        Directory containing the edge files.

    Yields
    ------
    pandas.DataFrame
        One frame per file, in ascending file-name order.
    """
    # os.listdir() returns entries in arbitrary, filesystem-dependent order;
    # sorting makes the order of the yielded frames (and therefore of the
    # merged table written below) reproducible between runs.
    for fn in sorted(os.listdir(_dir)):
        path = os.path.join(_dir, fn)
        dfm = pd.read_csv(path,
                          sep="\t",
                          header=None,
                          names=["Gene_A", "Gene_B", "LLS"])
        geo_accession_id = fn.split(".txt")[0]
        yield dfm.assign(GEO_ACCESSION=geo_accession_id)


if __name__ == "__main__":
    res_dir = get_path("resource/Coexpedia")

    # Stack all per-file frames and keep the complete merged table on disk.
    dfms = read_coexpedia_dfms(os.path.join(res_dir, "Hsa"))
    merged_dfm = pd.concat(dfms, axis=0)
    merged_dfm.to_csv(os.path.join(res_dir, "p1_coexpedia_merged.tsv"),
                      sep="\t",
                      index=False)

    # Only the gene pair columns go into the edge list; duplicate handling is
    # delegated to remove_duplicated_undirected_edges (defined elsewhere).
    reduced_dfm = merged_dfm.loc[:, ["Gene_A", "Gene_B"]]
    reduced_dfm = remove_duplicated_undirected_edges(reduced_dfm, sort=True)

    edge_dir = get_path("edge/gene-gene")
    reduced_dfm.to_csv(os.path.join(edge_dir, "Coexpedia.edgelist"),
                       sep="\t",
                       index=False,
                       header=False)
return df_a_minus_b def output_snp_gene_edgelist(snp_gene_map): snp_gene_el = snp_gene_map.loc[:, ["rs_id", "gene_ensembl_id"]].drop_duplicates() snp_gene_el = map_Ensembl_IDs_to_Entrez(snp_gene_el, ensembl_colname="gene_ensembl_id", new_colname="gene_id", keep_unmapped=True) snp_gene_el.sort_values(by="gene_id", inplace=True) return snp_gene_el def output_snp_snp_edgelist(snp_snp_map): return snp_snp_map.loc[:, ["rs_id_A", "rs_id_B"]].drop_duplicates() if __name__ == "__main__": _4DGenome_dir = get_path("resource/4DGenome") snp_in_interactorA = get_interacted_SNPs(os.path.join(_4DGenome_dir, "InteractorA_SNP_intxn.bed")) snp_in_interactorB = get_interacted_SNPs(os.path.join(_4DGenome_dir, "InteractorB_SNP_intxn.bed")) gene_tss_in_interactorA = get_genes_with_interacted_TSSs(os.path.join(_4DGenome_dir, "InteractorA_Ensembl_TSS_intxn.bed")) gene_tss_in_interactorB = get_genes_with_interacted_TSSs(os.path.join(_4DGenome_dir, "InteractorB_Ensembl_TSS_intxn.bed")) gene_prm_in_interactorA = get_genes_with_interacted_promoters(os.path.join(_4DGenome_dir, "InteractorA_Ensembl_promoter_intxn.bed")) gene_prm_in_interactorB = get_genes_with_interacted_promoters(os.path.join(_4DGenome_dir, "InteractorB_Ensembl_promoter_intxn.bed")) # interacted (SNP, gene) pairs (via TSSs/promoters) snp_tss_map = _cross_inner_join(snp_in_interactorA, snp_in_interactorB, gene_tss_in_interactorA, gene_tss_in_interactorB) snp_prm_map = _cross_inner_join(snp_in_interactorA, snp_in_interactorB, gene_prm_in_interactorA, gene_prm_in_interactorB) # remove redundant (SNP, gene) pairs that are both associated by TSSs and promoters
eqtl_fn_list = list_files(eqtl_dir, eqtl_suffix) for eqtl_fn in eqtl_fn_list: eqtl_path = os.path.join(eqtl_dir, eqtl_fn) eqtl_df = read_eqtl_df(eqtl_path) snp_egene_map = snp_df.merge(eqtl_df, on="rs_id", how="inner") eqtl_source = eqtl_fn.split(eqtl_suffix)[0] snp_egene_map = snp_egene_map.assign(eqtl_source=eqtl_source) yield snp_egene_map if __name__ == "__main__": snp_dir = get_path("vertex/SNP") snp_fn = "osu19_SNP.bed" snp_df = read_snp_df(os.path.join(snp_dir, snp_fn)) res_dir = get_path("resource/GTEx") eqtl_dir = os.path.join(res_dir, "GTEx_Analysis_v7_eQTL") eqtl_suffix = ".v7.egenes.txt" snp_egene_map = pd.concat(gen_snp_egene_map(snp_df, eqtl_dir, eqtl_suffix), ignore_index=True) # Get rid of version numbers snp_egene_map.loc[:, "gene_ensembl_id"] = snp_egene_map.loc[:, "gene_ensembl_id"].apply( lambda x: x. split(".")[0])
import os

import pandas as pd

from util_path import get_path

_gene_dir = get_path("vertex/gene")

# Ensembl <-> Entrez gene ID table, loaded once at import time with both ID
# columns kept as strings.
_ensembl_entrez_map = pd.read_csv(
    os.path.join(_gene_dir, "Ensembl_x_Entrez.tsv"),
    sep="\t",
    dtype={"Ensembl_Gene_ID": str, "Entrez_Gene_ID": str},
)

# rename columns in order to lower the possibility of naming conflicts
_ensembl_entrez_map = _ensembl_entrez_map.rename(columns={
    "Ensembl_Gene_ID": "_Ensembl_Gene_ID",
    "Entrez_Gene_ID": "_Entrez_Gene_ID",
})


def get_mapped_Ensembl_IDs():
    """Return the set of Ensembl gene IDs present in the mapping table."""
    ensembl_ids = _ensembl_entrez_map["_Ensembl_Gene_ID"]
    return set(ensembl_ids)


def map_Ensembl_IDs_to_Entrez(df, ensembl_colname, new_colname, keep_unmapped=False):
    """
    Please make sure that no column in `df` is named "_Ensembl_Gene_ID" or "_Entrez_Gene_ID"
    """
def get_rsid_list(snp_path=None):
    """Return the values of the fourth column of a SNP BED file as a list.

    Parameters
    ----------
    snp_path : str, optional
        Path of a tab-separated file without a header row. When omitted,
        ``osu19_SNP.bed`` inside the directory returned by
        ``get_path("vertex/SNP")`` is read, which is the original behavior.

    Returns
    -------
    list
        The fourth-column values (labelled ``name``), in file order.
    """
    if snp_path is None:
        snp_path = os.path.join(get_path("vertex/SNP"), "osu19_SNP.bed")

    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    # pandas 2.0; selecting the single parsed column works on every version.
    snp_names = pd.read_csv(snp_path,
                            sep="\t",
                            header=None,
                            usecols=[3],
                            names=["name"])
    return snp_names["name"].tolist()


if __name__ == "__main__":
    res_dir = get_path("resource/EncodeTFBS")

    rsid_list = get_rsid_list()

    # Build the SNP x TFBS matrix with helpers defined elsewhere in this file
    # and store it together with its index and header.
    snp_tfbs_matrix = get_snp_tfbs_matrix(rsid_list, config_key="local_hg19")
    snp_tfbs_matrix = update_tfbs_symbol(snp_tfbs_matrix)
    snp_tfbs_matrix.to_csv(os.path.join(res_dir, "p1_SNP_x_TFBS_matrix.tsv"),
                           sep="\t",
                           header=True,
                           index=True)

    # Convert the matrix into a two-column ("name", "symbol") map and store it
    # without the index.
    snp_tfbs_map = convert_to_map(snp_tfbs_matrix, map_colnames=["name", "symbol"])
    snp_tfbs_map.to_csv(os.path.join(res_dir, "p2_SNP_x_TFBS_map.tsv"),
                        sep="\t",
                        header=True,
                        index=False)
import os

import pandas as pd

from util_path import get_path
from util_edge import remove_duplicated_undirected_edges

if __name__ == "__main__":
    # Load only the two Entrez interactor columns of the BioGRID table.
    input_dir = get_path("resource/BioGRID")
    biogrid_path = os.path.join(input_dir, "BIOGRID-Human.tab2.tsv")
    interactor_columns = ["Entrez Gene Interactor A", "Entrez Gene Interactor B"]
    edgelist_df = pd.read_csv(biogrid_path, sep="\t", usecols=interactor_columns)

    edgelist_df = remove_duplicated_undirected_edges(edgelist_df, sort=True)

    # Write the edge list as bare tab-separated pairs (no index, no header).
    output_dir = get_path("edge/gene-gene")
    edgelist_path = os.path.join(output_dir, "BioGRID.edgelist")
    edgelist_df.to_csv(edgelist_path, sep="\t", index=False, header=False)
import os

import pandas as pd

from util_path import get_path

_dei_dir = get_path("resource/Util_DEI")

# Discontinued Entrez gene IDs, loaded once at import time and parsed as ints.
_dei_set = set(
    pd.read_csv(os.path.join(_dei_dir, "discontinued_entrez_id.tsv"),
                sep="\t",
                usecols=["Discontinued_GeneID"],
                dtype={"Discontinued_GeneID": int}).Discontinued_GeneID)


def filter_dei(df, id_col):
    """Return the rows of `df` whose `id_col` value is not a discontinued Entrez ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    id_col : str
        Name of the column in `df` holding the IDs to check.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose ID is absent from the discontinued-ID set. Any
        discontinued IDs that were found are printed first.
    """
    # `_dei_set` is only read here, so no `global` declaration is needed.
    # NOTE(review): the reference IDs are ints; IDs stored as strings in
    # `id_col` would never match -- confirm callers pass integer IDs.
    found_dei = set(df[id_col]).intersection(_dei_set)
    if found_dei:
        print("[util_dei] Found Discontinued Entrez ID: {}".format(found_dei))

    flag_dei = df[id_col].isin(found_dei)
    return df.loc[~flag_dei, :]
import os

import pandas as pd

from util_path import get_path
from util_edge import remove_duplicated_undirected_edges

if __name__ == "__main__":
    input_dir = get_path("resource/HumanNet")
    humannet_path = os.path.join(input_dir, "HumanNet-XN.tsv")

    # The three columns are named here; text after a "#" is treated as a
    # comment, and only the two gene columns are loaded.
    column_names = ["Gene_A", "Gene_B", "LLS"]
    edgelist_df = pd.read_csv(humannet_path,
                              sep="\t",
                              comment="#",
                              names=column_names,
                              usecols=["Gene_A", "Gene_B"])

    edgelist_df = remove_duplicated_undirected_edges(edgelist_df, sort=True)

    # Write the edge list as bare tab-separated pairs (no index, no header).
    output_dir = get_path("edge/gene-gene")
    edgelist_path = os.path.join(output_dir, "HumanNet.edgelist")
    edgelist_df.to_csv(edgelist_path, sep="\t", index=False, header=False)
def convert_to_df(closest_obj):
    """Parse the text form of `closest_obj` into an 11-column data frame.

    `str(closest_obj)` is read as tab-separated text without a header row.
    The columns are labelled, in order: four SNP fields (``snpChrom``,
    ``snpChromStart``, ``snpChromEnd``, ``snpName``), six gene fields
    (``geneChrom``, ``geneChromStart``, ``geneChromEnd``, ``geneName``,
    ``geneID``, ``geneStrand``) and ``distance``.

    Parameters
    ----------
    closest_obj : object
        Anything whose ``str()`` is the tab-separated table
        (presumably a bedtools "closest" result -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
    """
    column_names = [
        "snpChrom", "snpChromStart", "snpChromEnd", "snpName",
        "geneChrom", "geneChromStart", "geneChromEnd", "geneName",
        "geneID", "geneStrand",
        "distance",
    ]
    text_buffer = StringIO(str(closest_obj))
    return pd.read_csv(text_buffer, sep="\t", header=None, names=column_names)


if __name__ == "__main__":
    # SNP positions
    snp_dir = get_path("vertex/SNP")
    snp_fn = "osu19_SNP.bed"
    snp_bed_path = os.path.join(snp_dir, snp_fn)
    snp_start_bed = make_snp_start_BED(snp_bed_path)

    # TSS positions, once per gene ID namespace
    gene_dir = get_path("vertex/gene")

    ensembl_TSS_fn = "Ensembl_TSS.bed"
    ensembl_TSS_path = os.path.join(gene_dir, ensembl_TSS_fn)
    ensembl_TSS_start_bed = make_TSS_start_BED(ensembl_TSS_path)

    entrez_TSS_fn = "Entrez_TSS.bed"
    entrez_TSS_path = os.path.join(gene_dir, entrez_TSS_fn)
    entrez_TSS_start_bed = make_TSS_start_BED(entrez_TSS_path)

    # Wrap each table with convert_to_bedtool_obj (defined elsewhere)
    snp_start_bt = convert_to_bedtool_obj(snp_start_bed)
    ensembl_TSS_start_bt = convert_to_bedtool_obj(ensembl_TSS_start_bed)
    entrez_TSS_start_bt = convert_to_bedtool_obj(entrez_TSS_start_bed)
import os import pandas as pd import mygene from util_path import get_path from util_dei import filter_dei res_dir = get_path("resource/Entrez") gene_dir = get_path("vertex/gene") mg = mygene.MyGeneInfo() def read_gene2ensembl(): global res_dir g2e_df = pd.read_csv(os.path.join(res_dir, "gene2ensembl_9606.tsv"), sep="\t", header=None, names=["Tax_ID", "Entrez_Gene_ID", "Ensembl_Gene_ID"]) unique_tax_ids = g2e_df.Tax_ID.unique() assert(len(unique_tax_ids) == 1) assert(unique_tax_ids[0] == 9606) g2e_df = g2e_df.drop("Tax_ID", axis=1).drop_duplicates().reindex() return g2e_df def read_biomart(GRCh): global res_dir if GRCh == "37": filename = "GRCh37_p13_mart_export.txt" elif GRCh == "38":