def _strip_gene_version(gene_id):
    """Return *gene_id* up to (not including) its first '.', or unchanged if it has none."""
    return gene_id.split(".")[0]


def filter_gene_expression_profile(gene_list_file_name, gene_expression_file_name, gene_filter_file_name=None, gene_list_path=None, gene_expression_path=None, gene_filter_path=None, source="GDC-TCGA", dataset="melanoma", by_gene=False):
    """Write a copy of an expression table restricted to the genes in a gene list.

    The expression file is read line by line. The first line (header) is
    always kept; any other line is kept when its first tab-separated field,
    either as-is or truncated at its first '.', is in the gene list. The kept
    lines are written unchanged to ``<gene_expression_path>_filtered``.

    Args:
        gene_list_file_name: passed to ``load_gene_list`` to obtain the genes
            to keep.
        gene_expression_file_name: file name of the expression table; only
            used to build the path when ``gene_expression_path`` is None.
        gene_filter_file_name: optional second gene list. When given, the
            gene list is reduced to the entries that also appear in it
            (compared as-is or truncated at the first '.').
        gene_list_path: forwarded to ``load_gene_list`` for the gene list.
        gene_expression_path: full path of the expression table. Defaults to
            ``BASE_PROFILE/source/dataset/gene_expression_file_name``.
        gene_filter_path: forwarded to ``load_gene_list`` for the filter list.
        source: forwarded to ``load_gene_list`` and used in the default path.
        dataset: forwarded to ``load_gene_list`` and used in the default path.
        by_gene: accepted for interface compatibility; not used here.

    Returns:
        None. The result is the ``_filtered`` file written next to the input.

    Raises:
        IndexError: if the phenotype header has no ``LABEL_ID`` column, or if
            a kept line has more tab-separated fields than the header line
            has whitespace-separated fields.
    """
    # The phenotype file name, source and dataset are hard-coded here and do
    # not follow the ``source`` / ``dataset`` arguments.
    phenotype_rows = load_phenotype_data("TCGA-SKCM.GDC_phenotype.tsv", phenotype_list_path=None, source="GDC-TCGA", dataset="melanoma")
    phenotype_headers = phenotype_rows[0]
    label_index = [i for i, v in enumerate(phenotype_headers) if v == LABEL_ID][0]
    # First-column values of the header row and of every row labelled
    # METASTATIC or PRIMARY_TUMOR. A set makes the per-field lookup below O(1).
    invalid_labels = set(
        row[0] for i, row in enumerate(phenotype_rows)
        if i == 0 or row[label_index] == METASTATIC or row[label_index] == PRIMARY_TUMOR
    )

    gene_list = load_gene_list(gene_list_file_name=gene_list_file_name, gene_list_path=gene_list_path, source=source, dataset=dataset)
    if gene_filter_file_name:
        filter_gene_list = load_gene_list(gene_list_file_name=gene_filter_file_name, gene_list_path=gene_filter_path, source=source, dataset=dataset)
        allowed_genes = set(filter_gene_list)
        # split('.') instead of slicing at find('.'): find() returns -1 when
        # there is no '.', which used to chop off the last character.
        gene_list = [
            gene for gene in gene_list
            if gene in allowed_genes or _strip_gene_version(gene) in allowed_genes
        ]
    wanted_genes = set(gene_list)

    if gene_expression_path is None:
        gene_expression_path = os.path.join(BASE_PROFILE, source, dataset, gene_expression_file_name)

    expression_profiles_filtered = []
    with open(gene_expression_path, 'r') as expression_file:
        for line_number, line in enumerate(expression_file):
            if line_number == 0:
                # Header line is always kept.
                expression_profiles_filtered.append(line)
                continue
            # partition() keeps the whole line as the id when it has no tab,
            # where slicing at find('\t') == -1 dropped the last character.
            gene_id = line.strip().partition("\t")[0]
            if gene_id in wanted_genes or _strip_gene_version(gene_id) in wanted_genes:
                expression_profiles_filtered.append(line)

    # NOTE(review): this column-filtered copy is built but never written; the
    # output below contains the row-filtered lines only. Confirm whether the
    # "_filtered" file was meant to hold these rows instead.
    expression_profiles_filtered_out = []
    if expression_profiles_filtered:  # an empty input file has no header to index
        expression_headers = expression_profiles_filtered[0].split()
        for line in expression_profiles_filtered:
            fields = line.split("\t")
            kept_fields = [
                field for i, field in enumerate(fields)
                if expression_headers[i] not in invalid_labels
            ]
            expression_profiles_filtered_out.append("\t".join(kept_fields))

    with open(gene_expression_path + "_filtered", 'w+') as filtered_file:
        filtered_file.writelines(expression_profiles_filtered)
def fetch_string_ppi_edges():
    """Return a dict mapping "GO_a=GO_b" edge keys to summed PPI scores.

    When ``constants.USE_CACHE`` is set and ``GO_edges_ppi_total.txt`` exists
    in ``constants.DICTIONARIES_DIR``, the dict is loaded from that file
    (column 0 is the edge key, column 1 the integer score) and returned.

    Otherwise the dict is rebuilt: for every "src=dst" key of the dict
    returned by ``get_string_ppi_dict()`` whose two vertices are both present
    in ``get_ensp_dict()``, the edge score is added to every pair formed from
    the two vertices' "GO Terms" entries. A pair is stored under whichever
    orientation ("a=b" or "b=a") was seen first. The result is written as
    tab-separated lines to ``constants.OUTPUT_GLOBAL_DIR`` and returned.

    Progress is reported on stdout with print.

    Returns:
        dict: edge key (str) -> accumulated score (int).
    """
    if constants.USE_CACHE:
        if os.path.isfile(os.path.join(constants.DICTIONARIES_DIR, "GO_edges_ppi_total.txt")):
            print("about to load ppi")
            GO_edges_ppi_grid = infra.load_phenotype_data(
                "GO_edges_ppi_total.txt",
                phenotype_list_path=constants.DICTIONARIES_DIR)
            print("done load ppi ({} lines). about to load to dict".format(
                len(GO_edges_ppi_grid)))
            go_edges = {}
            for row in GO_edges_ppi_grid:
                go_edges[row[0]] = int(row[1])
            print("done load to dict")
            return go_edges

    print("fetching ensg")
    # NOTE(review): the returned dict is not used below. The call is kept in
    # case it has side effects not visible from this block; drop it if not.
    get_ensg_dict()
    print("fetching ensp")
    ensp_dict = get_ensp_dict()
    print("fetching string ppi")
    string_ppi_dict = get_string_ppi_dict()

    go_edges = {}
    # Iterating the dict directly stays lazy on both Python 2 and 3.
    for count, cur_edge in enumerate(string_ppi_dict, 1):
        print(count)
        vertices = cur_edge.split("=")
        if vertices[0] not in ensp_dict or vertices[1] not in ensp_dict:
            continue
        score = int(string_ppi_dict[cur_edge])  # hoisted: constant per PPI edge
        go_src = ensp_dict[vertices[0]]["GO Terms"]
        go_dst = ensp_dict[vertices[1]]["GO Terms"]
        for cur_src in go_src:
            for cur_dst in go_dst:
                edge = "{}={}".format(cur_src, cur_dst)
                edge_alt = "{}={}".format(cur_dst, cur_src)
                # Edges are undirected: accumulate under whichever
                # orientation is already stored.
                if edge in go_edges:
                    go_edges[edge] += score
                elif edge_alt in go_edges:
                    go_edges[edge_alt] += score
                else:
                    go_edges[edge] = score

    # NOTE(review): the cache above is read from DICTIONARIES_DIR but this
    # file is written to OUTPUT_GLOBAL_DIR; unless the two are the same
    # directory, a later call will not find it. Confirm the intended location.
    total = len(go_edges)  # progress denominator (was always 0 on this path)
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, "GO_edges_ppi_total.txt"), "w+") as f:
        for count, edge in enumerate(go_edges, 1):
            print("{}/{}".format(count, total))
            f.write("{}\t{}\n".format(edge, go_edges[edge]))
    return go_edges
from infra import load_phenotype_data from infra import load_gene_expression_profile_by_genes SEPARATOR = "@%@" LABEL_ID = "sample_type.samples" PRIMARY_TUMOR = "Primary Tumor" METASTATIC = "Metastatic" pd = load_phenotype_data("TCGA-SKCM.GDC_phenotype.tsv", phenotype_list_path=None, source="GDC-TCGA", dataset="melanoma") # pd = np.flip(np.rot90(pd, k=1, axes=(1, 0)), 1) epd = load_gene_expression_profile_by_genes("protein_coding.txt", "TCGA-SKCM.htseq_counts.tsv", gene_filter_file_name=None, gene_list_path=None, gene_expression_path=None, gene_filter_path=None, source="GDC-TCGA", dataset="melanoma") pd_headers = pd[0] label_index = [i for i, v in enumerate(pd_headers) if v == LABEL_ID][0] pd = sorted(filter(lambda i: i[0] in epd[0], pd), key=lambda i: epd[0].index(i[0])) # for i, cur in enumerate(pd): # print pd[i] # print epd[0][i] map = {
if not os.path.exists(association_file_location): wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL, os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)) print "Loading gene-GO associations" # gene2go = download_ncbi_associations(obo_file_location) - why does this line needed? go2geneids = read_ncbi_gene2go(association_file_location, taxids=[9606], go2geneids=True) geneids2go = read_ncbi_gene2go(association_file_location, taxids=[9606]) return (go2geneids, geneids2go) def fetch_string_ppi_edges(): go_edges = {} if constants.USE_CACHE: if os.path.isfile(os.path.join(constants.DICTIONARIES_DIR,"GO_edges_ppi_total.txt")): GO_edges_ppi_grid = infra.load_phenotype_data("GO_edges_ppi_total.txt",phenotype_list_path=constants.DICTIONARIES_DIR) for cur in GO_edges_ppi_grid: go_edges[cur[0]] = int(cur[1]) return go_edges print "fetching ensg" ensg_dict = get_ensg_dict() print "fetching ensp" ensp_dict = get_ensp_dict() print "fetching string ppi" string_ppi_dict = get_string_ppi_dict() go_edges = {} count = 0 for cur_edge, cur_score in string_ppi_dict.iteritems(): count +=1 print count