Example #1
0
def get_dict_bop_genes(config, dict_bop_cpgs):
    """Collect, for every key of ``dict_bop_cpgs``, the genes of its CpGs.

    Args:
        config: Passed to ``get_dict_cpg_gene`` to obtain the
            CpG -> genes mapping.
        dict_bop_cpgs: Mapping from a bop key to an iterable of CpG names.

    Returns:
        dict: bop key -> list of distinct genes, in the order in which
        they were first encountered while walking the CpGs of that key.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    dict_bop_genes = {}
    for bop, cpgs in dict_bop_cpgs.items():
        genes = []
        for curr_cpg in cpgs:
            # A CpG that is missing from the annotation yields None;
            # treat it as "no genes" instead of failing on `genes += None`.
            genes.extend(dict_cpg_gene.get(curr_cpg) or ())
        # dict.fromkeys removes duplicates while keeping a deterministic
        # (first-seen) order, unlike list(set(...)).
        dict_bop_genes[bop] = list(dict.fromkeys(genes))
    return dict_bop_genes
Example #2
0
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save the top CpGs and genes.

    For every CpG returned by ``load_cpg_data`` the values are split into
    one sample per entry of ``get_attributes_dict(config)`` (each dict value
    is used to index the CpG's value array) and ``stats.f_oneway`` is run on
    those samples. CpGs are sorted by ascending p-value. Each gene takes the
    p-value of the best-ranked CpG it is attached to.

    Two files named ``top.txt`` are written through ``get_result_path`` /
    ``save_features``: first the CpG ranking, then the gene ranking.

    Side effects on ``config``: ``approach_gd`` is set to
    ``GeneDataType.from_cpg`` (and left that way); ``dt`` is switched to
    ``DataType.gene`` for the gene output and reset to ``DataType.cpg``
    afterwards, even if saving fails.

    Args:
        config: Project configuration object.
        num_top: Maximum number of CpGs and of genes to save.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    # The group index sets are the same for every CpG, so fetch them once.
    groups = list(attributes_dict.values())

    pvals = []
    for curr_vals in vals:
        curr_vals = np.asarray(curr_vals)
        samples = [list(curr_vals[group]) for group in groups]
        pvals.append(stats.f_oneway(*samples).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)

    cpgs_sorted = cpgs_sorted[0:num_top]
    pvals_sorted = pvals_sorted[0:num_top]

    genes_sorted = genes_sorted[0:num_top]
    pvals_genes = pvals_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, pvals_sorted])

    # The result path depends on config, so switch it to the gene data type
    # before asking for the second path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted, pvals_genes])
    finally:
        config.dt = DataType.cpg
Example #3
0
def get_raw_dict(config):
    """Group raw CpG values by gene, ordered by the CpG map info.

    For every CpG from ``load_cpg_data`` that is not listed by
    ``get_non_inc_cpgs`` and has an entry in the CpG -> genes mapping, the
    CpG's values are appended to each of its genes: one list per index in
    ``range(len(attributes))``, one element per CpG. Finally the elements
    of every such list are reordered by ascending ``int(map_info)`` of the
    contributing CpGs (presumably a genomic coordinate -- verify against
    ``get_dict_cpg_map_info``).

    Args:
        config: Project configuration object.

    Returns:
        dict: gene -> list (one entry per attribute index) of lists holding
        the CpG values sorted by map info.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    dict_cpg_map = get_dict_cpg_map_info(config)
    attributes = get_attributes(config)
    cpgs, vals = load_cpg_data(config)

    cpg_non_inc = get_non_inc_cpgs(config)

    values_by_gene = {}
    positions_by_gene = {}
    for row_id, cpg_name in enumerate(cpgs):
        cpg_vals = vals[row_id]

        if cpg_name in cpg_non_inc:
            continue

        cpg_genes = dict_cpg_gene.get(cpg_name)
        cpg_position = dict_cpg_map.get(cpg_name)
        if cpg_genes is None:
            continue

        for gene in cpg_genes:
            if gene not in values_by_gene:
                # First CpG of this gene: start one empty list per
                # attribute index and an empty position list.
                values_by_gene[gene] = [[] for _ in range(len(attributes))]
                positions_by_gene[gene] = []
            per_attribute = values_by_gene[gene]
            for sample_id in range(len(attributes)):
                per_attribute[sample_id].append(cpg_vals[sample_id])
            positions_by_gene[gene].append(int(cpg_position))

    # Reorder the CpG values of every gene by ascending map info.
    for gene in values_by_gene:
        by_position = np.argsort(positions_by_gene[gene])
        values_by_gene[gene] = [
            list(np.array(record)[by_position])
            for record in values_by_gene[gene]
        ]

    return values_by_gene
Example #4
0
def save_top_spearman(config, num_top=500):
    """Rank CpGs by |Spearman rho| against the attributes and save the top.

    ``stats.spearmanr(attributes, values)`` is computed for every CpG from
    ``load_cpg_data``. CpGs are sorted by descending absolute rho; CpGs
    whose rho is NaN (e.g. constant values) are placed last instead of
    first. The gene list is the concatenation of the genes of the ranked
    CpGs, each paired with that CpG's rho.

    Two files named ``top.txt`` are written through ``get_result_path`` /
    ``save_features``: first the CpG ranking, then the gene ranking.

    Side effects on ``config``: ``approach_gd`` is set to
    ``GeneDataType.from_cpg`` (and left that way); ``dt`` is switched to
    ``DataType.gene`` for the gene output and reset to ``DataType.cpg``
    afterwards, even if saving fails.

    Args:
        config: Project configuration object.
        num_top: Maximum number of CpG rows and of gene rows to save.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for curr_vals in vals:
        rho, _ = stats.spearmanr(attributes, curr_vals)
        rhos.append(rho)

    abs_rhos = np.abs(np.asarray(rhos, dtype=float))
    order = np.argsort(abs_rhos)[::-1]
    # argsort puts NaN last, so the reversal above moves NaN to the front.
    # Push those entries back to the end, keeping everything else in place.
    is_nan = np.isnan(abs_rhos[order])
    order = np.concatenate((order[~is_nan], order[is_nan]))

    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    # NOTE(review): genes are not deduplicated here, so a gene attached to
    # several CpGs appears several times -- confirm this is intended.
    genes_sorted = []
    rhos_genes = []
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    cpgs_sorted = cpgs_sorted[0:num_top]
    rhos_sorted = rhos_sorted[0:num_top]

    genes_sorted = genes_sorted[0:num_top]
    rhos_genes = rhos_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, rhos_sorted])

    # The result path depends on config, so switch it to the gene data type
    # before asking for the second path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted, rhos_genes])
    finally:
        config.dt = DataType.cpg
Example #5
0
def load_cpg_data(config):
    """Read ``average_beta.txt`` and return the usable CpGs with their values.

    The first ``config.num_skip_lines`` lines are skipped. Every remaining
    line is split with ``line_proc``; lines containing ``config.miss_tag``
    in any column are dropped. The first column is the CpG name, the others
    are parsed as floats and then restricted to ``config.indexes``. A CpG is
    kept only if it is not listed by ``get_non_inc_cpgs`` and is a key of
    the CpG -> genes mapping.

    Progress is printed every ``config.print_rate`` data lines.

    Args:
        config: Project configuration object.

    Returns:
        tuple: ``(cpgs_passed, vals_passed)`` -- the kept CpG names and,
        aligned with them, the list of selected values of each CpG.
    """
    indexes = config.indexes
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpg_non_inc = get_non_inc_cpgs(config)

    full_path = get_path(config, 'average_beta.txt')

    cpgs_passed = []
    vals_passed = []

    # `with` guarantees the file is closed even if parsing a line fails.
    with open(full_path) as f:
        for _ in range(config.num_skip_lines):
            f.readline()

        for num_lines, line in enumerate(f, start=1):
            col_vals = line_proc(config, line)

            if config.miss_tag not in col_vals:
                cpg = col_vals[0]
                vals = list(map(float, col_vals[1:]))
                vals = list(np.array(vals)[indexes])

                if cpg not in cpg_non_inc and cpg in dict_cpg_gene:
                    vals_passed.append(vals)
                    cpgs_passed.append(cpg)

            if num_lines % config.print_rate == 0:
                print('num_lines: ' + str(num_lines))

    return cpgs_passed, vals_passed
Example #6
0
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is among the top ElasticNet coefficients.

    ``ShuffleSplit`` produces ``num_bootstrap_runs`` train/test splits of
    the samples, with ``int(len(attributes) * config.test_part)`` samples
    in the test part. On each training part an ``ElasticNet`` (``alpha``
    and ``l1_ratio`` taken from ``load_params_dict``) is fitted with the
    CpG values as features and the attributes as target. The CpGs with the
    ``num_top`` largest absolute coefficients get their counter increased.

    CpGs are then ranked by descending count. Each gene takes the count of
    the best-ranked CpG it is attached to. Two files named ``top.txt`` are
    written through ``get_result_path`` / ``save_features``: first the CpG
    ranking, then the gene ranking.

    Side effects on ``config``: ``approach_gd`` is set to
    ``GeneDataType.from_cpg`` (and left that way); ``dt`` is switched to
    ``DataType.gene`` for the gene output and reset to ``DataType.cpg``
    afterwards, even if saving fails.

    Args:
        config: Project configuration object.
        num_bootstrap_runs: Number of random train/test splits.
        num_top: Number of top-coefficient CpGs counted per split.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    num_samples = len(attributes)
    test_size = int(num_samples * config.test_part)
    train_size = num_samples - test_size
    # test_size / train_size are keyword-only in current scikit-learn.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(num_samples))

    # Convert once; rows are samples, columns are CpGs.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing copes with fewer than num_top CpGs being available.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    # The result path depends on config, so switch it to the gene data type
    # before asking for the second path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted, counts_genes])
    finally:
        config.dt = DataType.cpg
Example #7
0
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save the top CpGs/genes.

    For every CpG from ``load_cpg_data``,
    ``stats.linregress(values, attributes)`` is computed, i.e. the CpG
    values are the ``x`` argument and the attributes the ``y`` argument.
    CpGs are sorted by ascending p-value. Each gene takes the statistics of
    the best-ranked CpG it is attached to.

    Two files named ``top.txt`` are written through ``get_result_path`` /
    ``save_features``, each with the columns name, p-value, r-value, slope,
    intercept: first the CpG ranking, then the gene ranking.

    Side effects on ``config``: ``approach_gd`` is set to
    ``GeneDataType.from_cpg`` (and left that way); ``dt`` is switched to
    ``DataType.gene`` for the gene output and reset to ``DataType.cpg``
    afterwards, even if saving fails.

    Args:
        config: Project configuration object.
        num_top: Maximum number of CpGs and of genes to save.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for curr_vals in vals:
        slope, intercept, r_value, p_value, _ = stats.linregress(
            curr_vals, attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])
    slopes_sorted = list(np.array(slopes)[order])
    intercepts_sorted = list(np.array(intercepts)[order])
    rvals_sorted = list(np.array(rvals)[order])

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    ranked = zip(cpgs_sorted, pvals_sorted, slopes_sorted,
                 intercepts_sorted, rvals_sorted)
    for cpg, pval, slope, intercept, rval in ranked:
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                pvals_genes.append(pval)
                slopes_genes.append(slope)
                intercepts_genes.append(intercept)
                rvals_genes.append(rval)

    cpgs_sorted = cpgs_sorted[0:num_top]
    pvals_sorted = pvals_sorted[0:num_top]
    slopes_sorted = slopes_sorted[0:num_top]
    intercepts_sorted = intercepts_sorted[0:num_top]
    rvals_sorted = rvals_sorted[0:num_top]

    genes_sorted = genes_sorted[0:num_top]
    pvals_genes = pvals_genes[0:num_top]
    slopes_genes = slopes_genes[0:num_top]
    intercepts_genes = intercepts_genes[0:num_top]
    rvals_genes = rvals_genes[0:num_top]

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted, pvals_sorted, rvals_sorted, slopes_sorted,
        intercepts_sorted
    ])

    # The result path depends on config, so switch it to the gene data type
    # before asking for the second path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [
            genes_sorted, pvals_genes, rvals_genes, slopes_genes,
            intercepts_genes
        ])
    finally:
        config.dt = DataType.cpg
Example #8
0
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Find CpGs whose regression slope changes most across ``limit``.

    Two copies of ``config`` are restricted with ``age_less`` and
    ``age_more`` at ``limit``. For every CpG of the "less" subset a
    ``stats.linregress(attributes, values)`` fit is done in both subsets.
    A CpG passes when the larger of the two p-values is below ``pval``;
    its score ("angle") is the absolute difference of the two slopes. The
    ``num_opt`` passing CpGs with the largest angle are saved.

    Outputs (paths from ``get_result_path``):
        * ``bend_<limit>.txt`` via ``save_features`` with the columns CpG,
          genes (joined with ``;`` or ``'nan'``), angle, then slope,
          intercept, r-value, p-value, std-err for the "less" subset and
          the same five for the "more" subset.
        * ``bend_data_<limit>.txt`` via ``np.savetxt``: one line per saved
          CpG holding its name and its values (``0.8e`` format) loaded with
          a fresh ``Config`` that uses ``Gender.any``.

    Progress is printed every ``config.print_rate`` CpGs.

    Args:
        config: Project configuration object.
        limit: Threshold handed to ``age_less`` / ``age_more``.
        pval: Significance threshold both fits must stay below.
        num_opt: Maximum number of CpGs to save.

    Raises:
        ValueError: If a CpG of the "less" subset is missing from the
            "more" subset, or a saved CpG is missing from the raw data.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)

    # Name -> first row index, replacing a linear list.index() per CpG.
    row_of_m = {}
    for row_id, name in enumerate(cpg_names_m):
        row_of_m.setdefault(name, row_id)

    # One tuple per passing CpG, in the column order of the output file.
    num_columns = 13
    rows = []
    for num_cpgs, cpg in enumerate(cpg_names_l, start=1):
        row_m = row_of_m.get(cpg)
        if row_m is None:
            raise ValueError(repr(cpg) + ' is not in list')
        vals_l = cpg_vals_l[num_cpgs - 1]
        vals_m = cpg_vals_m[row_m]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)
        angle = abs(slope_l - slope_m)

        if max(p_value_l, p_value_m) < pval:
            genes = cpg_gene_dict.get(cpg)
            # No genes, or a leading empty gene name, is reported as 'nan'.
            if genes and genes[0] != '':
                genes_str = ";".join(genes)
            else:
                genes_str = 'nan'

            rows.append((
                cpg, genes_str, angle,
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    # Transpose rows into columns; keep 13 empty columns if nothing passed.
    columns = list(zip(*rows)) if rows else [()] * num_columns

    # Column 2 holds the angles: largest first, at most num_opt entries.
    order = np.argsort(columns[2])[::-1][0:num_opt]
    features = [list(np.array(column)[order]) for column in columns]
    cpgs_opt = features[0]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, features)

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)

    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)
    row_of_raw = {}
    for row_id, name in enumerate(cpg_name_raw):
        row_of_raw.setdefault(name, row_id)

    cpg_str_list = []
    for cpg in cpgs_opt:
        row_raw = row_of_raw.get(cpg)
        if row_raw is None:
            raise ValueError(repr(cpg) + ' is not in list')
        fields = [cpg]
        fields.extend(format(val, '0.8e') for val in cpg_vals_raw[row_raw])
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")