Beispiel #1
0
def filter_gene_expression_profile(gene_list_file_name,
                                   gene_expression_file_name,
                                   gene_filter_file_name=None,
                                   gene_list_path=None,
                                   gene_expression_path=None,
                                   gene_filter_path=None,
                                   source="GDC-TCGA",
                                   dataset="melanoma",
                                   by_gene=False):
    """Write a gene-filtered copy of a tab-separated expression file.

    The header line of the expression file is always kept. Every other line
    is kept when its first tab-separated field, either as-is or with
    everything from the first '.' removed, appears in the gene list. The kept
    lines are written unchanged to ``<gene_expression_path>_filtered``.

    Args:
        gene_list_file_name: passed to ``load_gene_list`` to obtain the genes
            to keep.
        gene_expression_file_name: expression file name; only used to build
            the default path when ``gene_expression_path`` is None.
        gene_filter_file_name: optional second gene list. When given, the
            gene list is narrowed to entries that (as-is or with the part
            from the first '.' removed) also appear in this list.
        gene_list_path: passed to ``load_gene_list`` for the gene list.
        gene_expression_path: full path of the expression file. Defaults to
            ``BASE_PROFILE/source/dataset/gene_expression_file_name``.
        gene_filter_path: passed to ``load_gene_list`` for the filter list.
        source: passed to ``load_gene_list`` and used in the default path.
        dataset: passed to ``load_gene_list`` and used in the default path.
        by_gene: currently unused.

    Returns:
        None. The result is the ``_filtered`` file written next to the input.
    """
    # The phenotype table is always the TCGA-SKCM one, regardless of the
    # ``source`` / ``dataset`` arguments.
    phenotype_rows = load_phenotype_data("TCGA-SKCM.GDC_phenotype.tsv",
                                         phenotype_list_path=None,
                                         source="GDC-TCGA",
                                         dataset="melanoma")
    phenotype_headers = phenotype_rows[0]
    label_index = [
        i for i, v in enumerate(phenotype_headers) if v == LABEL_ID
    ][0]
    # First cell of the header row plus the first cell of every row labelled
    # metastatic or primary tumor. A set makes the per-column lookups O(1).
    invalid_labels = set(
        row[0] for i, row in enumerate(phenotype_rows)
        if i == 0 or row[label_index] in (METASTATIC, PRIMARY_TUMOR))

    gene_list = load_gene_list(gene_list_file_name=gene_list_file_name,
                               gene_list_path=gene_list_path,
                               source=source,
                               dataset=dataset)
    if gene_filter_file_name:
        allowed_genes = set(
            load_gene_list(gene_list_file_name=gene_filter_file_name,
                           gene_list_path=gene_filter_path,
                           source=source,
                           dataset=dataset))
        # split('.')[0] leaves an id without a '.' intact; slicing with
        # find('.') would cut off its last character when find returns -1.
        gene_list = [
            gene for gene in gene_list
            if gene in allowed_genes or gene.split('.')[0] in allowed_genes
        ]
    wanted_genes = set(gene_list)

    if gene_expression_path is None:
        gene_expression_path = os.path.join(BASE_PROFILE, source, dataset,
                                            gene_expression_file_name)

    expression_profiles_filtered = []
    with open(gene_expression_path, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                # Header line is kept unconditionally.
                expression_profiles_filtered.append(line)
                continue
            # First tab-separated field; a line without any tab is taken
            # whole instead of losing its last character.
            gene_id = line.strip().split('\t', 1)[0]
            if (gene_id in wanted_genes
                    or gene_id.split('.')[0] in wanted_genes):
                expression_profiles_filtered.append(line)

    # NOTE(review): the column-filtered rows built below are never written;
    # the output file receives the gene-filtered lines with all columns. It is
    # unclear whether dropping the "invalid" sample columns was intended, so
    # the written output is deliberately left as it was - confirm and either
    # write this list or delete the computation.
    expression_profiles_filtered_out = []
    expression_headers = expression_profiles_filtered[0].split()
    for line in expression_profiles_filtered:
        kept_cells = [
            cell for i, cell in enumerate(line.split("\t"))
            if expression_headers[i] not in invalid_labels
        ]
        expression_profiles_filtered_out.append("\t".join(kept_cells))

    with open(gene_expression_path + "_filtered", 'w+') as f:
        f.writelines(expression_profiles_filtered)
Beispiel #2
0
def fetch_string_ppi_edges():
    """Return GO-term-pair weights summed from STRING PPI edge scores.

    When ``constants.USE_CACHE`` is set and ``GO_edges_ppi_total.txt`` exists
    in ``constants.DICTIONARIES_DIR``, the weights are loaded from that file
    (first column is the edge key, second column the integer weight).

    Otherwise every PPI edge ("a=b" key, score value) whose two endpoints are
    both present in the ENSP dictionary contributes its integer score to each
    pair of GO terms taken from the two endpoints. A pair is stored under
    "src=dst"; if the reversed key already exists, that one is updated
    instead. The result is also written as tab-separated lines to
    ``GO_edges_ppi_total.txt`` in ``constants.OUTPUT_GLOBAL_DIR``.

    Returns:
        dict mapping "GO_a=GO_b" edge keys to summed integer scores.
    """
    # NOTE(review): the cache is read from DICTIONARIES_DIR, but the file is
    # written to OUTPUT_GLOBAL_DIR below - confirm the two are meant to differ.
    if constants.USE_CACHE and os.path.isfile(
            os.path.join(constants.DICTIONARIES_DIR,
                         "GO_edges_ppi_total.txt")):
        go_edges = {}
        print("about to load ppi")
        GO_edges_ppi_grid = infra.load_phenotype_data(
            "GO_edges_ppi_total.txt",
            phenotype_list_path=constants.DICTIONARIES_DIR)
        grid_len = len(GO_edges_ppi_grid)
        print("done load ppi ({} lines). about to load to dict".format(
            grid_len))
        for cur in GO_edges_ppi_grid:
            go_edges[cur[0]] = int(cur[1])
        print("done load to dict")
        return go_edges

    print("fetching ensg")
    # The result is not used here; the call is kept in case it has side
    # effects the rest of the pipeline relies on - TODO confirm and drop.
    get_ensg_dict()
    print("fetching ensp")
    ensp_dict = get_ensp_dict()
    print("fetching string ppi")
    string_ppi_dict = get_string_ppi_dict()

    go_edges = {}
    count = 0
    for cur_edge, cur_score in string_ppi_dict.iteritems():
        count += 1
        print(count)
        vertices = cur_edge.split("=")
        # Skip edges with an endpoint missing from the ENSP dictionary.
        if vertices[0] not in ensp_dict or vertices[1] not in ensp_dict:
            continue

        go_src = ensp_dict[vertices[0]]["GO Terms"]
        go_dst = ensp_dict[vertices[1]]["GO Terms"]

        for cur_src in go_src:
            for cur_dst in go_dst:
                edge = "{}={}".format(cur_src, cur_dst)
                if edge not in go_edges:
                    # Treat the pair as undirected: reuse the reversed key
                    # when it is the one already stored.
                    edge_alt = "{}={}".format(cur_dst, cur_src)
                    if edge_alt in go_edges:
                        edge = edge_alt
                go_edges[edge] = go_edges.get(edge, 0) + int(cur_score)

    # Progress is reported against the real number of edges being written.
    total = len(go_edges)
    with open(
            os.path.join(constants.OUTPUT_GLOBAL_DIR,
                         "GO_edges_ppi_total.txt"), "w+") as f:
        for count, (k, v) in enumerate(go_edges.iteritems(), 1):
            print("{}/{}".format(count, total))
            f.write("{}\t{}\n".format(k, v))

    return go_edges
from infra import load_phenotype_data
from infra import load_gene_expression_profile_by_genes

SEPARATOR = "@%@"

# Name of the phenotype column read as the label, and two of its values.
LABEL_ID = "sample_type.samples"
PRIMARY_TUMOR = "Primary Tumor"
METASTATIC = "Metastatic"

# Phenotype table; its first row is used as the header row below.
pd = load_phenotype_data(
    "TCGA-SKCM.GDC_phenotype.tsv",
    phenotype_list_path=None,
    source="GDC-TCGA",
    dataset="melanoma",
)
# pd = np.flip(np.rot90(pd, k=1, axes=(1, 0)), 1)

# Expression data; its first row is used below to select and order the
# phenotype rows.
epd = load_gene_expression_profile_by_genes(
    "protein_coding.txt",
    "TCGA-SKCM.htseq_counts.tsv",
    gene_filter_file_name=None,
    gene_list_path=None,
    gene_expression_path=None,
    gene_filter_path=None,
    source="GDC-TCGA",
    dataset="melanoma",
)

pd_headers = pd[0]
# Position of the label column within the phenotype header row.
label_index = [i for i, v in enumerate(pd_headers) if v == LABEL_ID][0]

# Keep only phenotype rows whose first cell occurs in the first expression
# row, ordered by the position of that cell within it.
pd = sorted(
    (row for row in pd if row[0] in epd[0]),
    key=lambda row: epd[0].index(row[0]),
)

# for i, cur in enumerate(pd):
#     print pd[i]
#     print epd[0][i]

map = {
Beispiel #4
0
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL,
                      os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME))

    print "Loading gene-GO associations"
    # gene2go = download_ncbi_associations(obo_file_location) - why is this line needed?
    go2geneids = read_ncbi_gene2go(association_file_location, taxids=[9606], go2geneids=True)
    geneids2go = read_ncbi_gene2go(association_file_location, taxids=[9606])

    return (go2geneids, geneids2go)

def fetch_string_ppi_edges():
    go_edges = {}
            if constants.USE_CACHE:
        if os.path.isfile(os.path.join(constants.DICTIONARIES_DIR,"GO_edges_ppi_total.txt")):
            GO_edges_ppi_grid = infra.load_phenotype_data("GO_edges_ppi_total.txt",phenotype_list_path=constants.DICTIONARIES_DIR)
            for cur in GO_edges_ppi_grid:
                go_edges[cur[0]] = int(cur[1])
            return go_edges

    print "fetching ensg"
    ensg_dict = get_ensg_dict()
    print "fetching ensp"
    ensp_dict = get_ensp_dict()
    print "fetching string ppi"
    string_ppi_dict = get_string_ppi_dict()
    go_edges = {}
    count = 0
    for cur_edge, cur_score in string_ppi_dict.iteritems():
        count +=1
        print count