Beispiel #1
0
def get_ppi_results(ppi_graph_path: str,
                    dataset: str,
                    evaluation: str = 'cv',
                    assoc_path: str = None,
                    phewas: str = None) -> pd.DataFrame:
    """Build a PPI network for ``dataset`` and collect its ranking metrics.

    The DGE file of the dataset is parsed, combined with the PPI graph at
    ``ppi_graph_path``, written out as gat2vec input under ``g2v_path`` and
    then passed to ``rank_targets``.

    :param ppi_graph_path: path of the PPI graph handed to
        ``generate_ppi_network``.
    :param dataset: dataset name; selects the DGE file, the DGE column
        headers and the targets file.
    :param evaluation: evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: optional path to association scores; when falsy no
        scores are parsed and the falsy value itself is forwarded.
    :param phewas: forwarded as ``disease_associations_path``.
    :return: frame with the ``auc`` and ``aps`` columns of the metrics plus
        constant ``eval`` and ``dge`` columns.
    """
    # AD datasets use their own set of DGE column headers.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    dge_path = dge_file(dataset)
    genes = parse_dge(
        dge_path=dge_path,
        entrez_id_header=headers['id'],
        log2_fold_change_header=headers['l2f'],
        adj_p_header=headers['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=headers['mean'],
        csv_separator=';',
    )

    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=genes,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas,
    )
    logger.info(f'Nodes {len(network.graph.vs)}')

    targets_path = targets_file(dataset_to_disease_abv(dataset))
    targets = parse_gene_list(targets_path, network)

    # Scores are only parsed when a path was given; otherwise the falsy
    # ``assoc_path`` value is passed through unchanged.
    if assoc_path:
        assoc_score = parse_association_scores(assoc_path)
    else:
        assoc_score = assoc_path
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
        assoc_score=assoc_score,
    )
    ranking = rank_targets(directory=g2v_path,
                           network=network,
                           evaluation=evaluation)
    metrics_df = ranking[0]

    summary = pd.DataFrame()
    for metric in ('auc', 'aps'):
        summary[metric] = metrics_df[metric]
    summary['eval'] = evaluation
    summary['dge'] = dataset
    return summary
Beispiel #2
0
def get_bel_results(dataset) -> pd.DataFrame:
    """Build a BEL network for ``dataset`` and collect its ranking metrics.

    The DGE file of the dataset is parsed, combined with the BEL graph(s)
    referenced by ``bel_files``, written out as gat2vec input under
    ``g2v_path`` and then passed to ``rank_targets`` with fixed walk
    parameters.

    :param dataset: dataset name; selects the DGE file, the DGE column
        headers and the ``ot_symbol.txt`` targets file of its disease.
    :return: a single frame with the ``auc`` and ``aps`` columns of the
        metrics plus constant ``ds`` (always ``'bel'``) and ``dge`` columns.
    """
    # AD datasets use their own set of DGE column headers.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis
    gene_list = parse_dge(dge_path=dge_file(dataset),
                          entrez_id_header=dge_params['id'],
                          log2_fold_change_header=dge_params['l2f'],
                          adj_p_header=dge_params['adjp'],
                          entrez_delimiter=split_char,
                          base_mean_header=dge_params['mean'],
                          csv_separator=';')
    network = generate_bel_network(
        bel_graph_path=bel_files,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
    )
    logger.info(f'Nodes {len(network.graph.nodes)}')

    targets_path = os.path.join(targets_base_path,
                                dataset_to_disease_abv(dataset),
                                'ot_symbol.txt')
    targets = parse_gene_list(targets_path, network)
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
    )
    # Only the metrics frame is needed; the second return value is ignored.
    metrics_df, _ = rank_targets(directory=g2v_path,
                                 network=network,
                                 num_walks=30,
                                 walk_length=4,
                                 dimension=256,
                                 window_size=10)
    df = pd.DataFrame()
    df['auc'] = metrics_df['auc']
    df['aps'] = metrics_df['aps']
    df['ds'] = 'bel'
    df['dge'] = dataset
    return df
Beispiel #3
0
def get_ppi_results(ppi_graph_path: str, dataset: str) -> pd.DataFrame:
    """Build a PPI network for ``dataset`` and collect its ranking metrics.

    :param ppi_graph_path: path of the PPI graph; its base name is also the
        key looked up in ``network_alias`` to label the result.
    :param dataset: dataset name; selects the DGE file, the DGE column
        headers and the targets file.
    :return: frame with the ``auc`` and ``aps`` columns of the metrics plus
        constant ``ds`` (network alias) and ``dge`` columns.
    """
    # AD datasets use their own set of DGE column headers.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    dge_path = dge_file(dataset)
    genes = parse_dge(dge_path=dge_path,
                      entrez_id_header=headers['id'],
                      log2_fold_change_header=headers['l2f'],
                      adj_p_header=headers['adjp'],
                      entrez_delimiter=split_char,
                      base_mean_header=headers['mean'])

    network = generate_ppi_network(ppi_graph_path=ppi_graph_path,
                                   dge_list=genes,
                                   max_adj_p=max_padj,
                                   max_log2_fold_change=max_log2_fold_change,
                                   min_log2_fold_change=min_log2_fold_change,
                                   ppi_edge_min_confidence=ppi_edge_min_confidence,
                                   current_disease_ids_path='')

    graph = network.graph
    logger.info(f'Nodes {len(graph.vs)}')
    logger.info(f'Edges {len(graph.es)}')

    targets_path = targets_file(dataset_to_disease_abv(dataset))
    targets = parse_gene_list(targets_path, network)
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(network=network,
                              targets=targets,
                              home_dir=g2v_path)
    ranking = rank_targets(directory=g2v_path, network=network)
    metrics_df = ranking[0]

    summary = pd.DataFrame()
    for metric in ('auc', 'aps'):
        summary[metric] = metrics_df[metric]
    summary['ds'] = network_alias[os.path.basename(ppi_graph_path)]
    summary['dge'] = dataset
    return summary
Beispiel #4
0
def main():
    """Compare ranking AUCs with and without association weights per DGE set.

    For each of the BM10/BM22/BM36/BM44 DGE files one figure is produced with
    one box-plot panel per entry of ``graph_paths``. Every panel shows four
    AUC distributions:

    * ``rr``    - default evaluation, unweighted gat2vec input
    * ``bsvm``  - ``evaluation='svm'`` with balanced class weights, unweighted
    * ``wrr``   - default evaluation, input written with association scores
    * ``wbsvm`` - balanced SVM, input written with association scores

    The figure is saved as ``comparison_humanbase(<dge>).png`` and closed
    afterwards so figures do not pile up across iterations.
    """
    # Symmetric fold-change window; does not depend on the loop variables.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, -lfc_cutoff
    svm_kwargs = {'evaluation': 'svm', 'class_weights': 'balanced'}

    for dge in ['BM10', 'BM22', 'BM36', 'BM44']:
        dge_path = dge_base_path % dge

        gene_list = parse_dge(dge_path=dge_path,
                              entrez_id_header=entrez_id_name,
                              log2_fold_change_header=log_fold_change_name,
                              adj_p_header=adjusted_p_value_name,
                              entrez_delimiter=split_char,
                              base_mean_header=base_mean_name,
                              csv_separator=';')

        # squeeze=False keeps ``axs`` two-dimensional even for one panel.
        fig, axs = plt.subplots(ncols=len(graph_paths),
                                sharey='all',
                                squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {dge}')

        # Scores depend only on ``assoc_file``; parse them once per figure.
        # A falsy ``assoc_file`` is passed through unchanged.
        assoc_score = assoc_file and parse_association_scores(assoc_file)

        # NOTE(review): the frame is shared by all panels of a figure, so
        # later assignments are aligned on the index set by the first one.
        df = pd.DataFrame()

        for axs_ind, ppi_graph_path in enumerate(graph_paths):
            ax = axs[0][axs_ind]

            network = generate_ppi_network(
                ppi_graph_path=os.path.join(ppi_base_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)

            # First pass writes input without weights, second pass with the
            # association scores; both are evaluated the same two ways.
            for prefix, write_kwargs in (('', {}),
                                         ('w', {'assoc_score': assoc_score})):
                write_gat2vec_input_files(network=network,
                                          targets=targets,
                                          home_dir=g2v_path,
                                          **write_kwargs)
                for suffix, rank_kwargs in (('rr', {}), ('bsvm', svm_kwargs)):
                    auc_df, _ = rank_targets(directory=g2v_path,
                                             network=network,
                                             **rank_kwargs)
                    df[prefix + suffix] = auc_df['auc']

            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=ax)
            ax.set_title(f'PPI {ppi_graph_path}')

        fig.savefig(f'comparison_humanbase({dge}).png')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
Beispiel #5
0
def main():
    """Compare ranking AUCs per disease, PPI graph and confidence cutoff.

    For each disease in ``DISEASE_ABBREVIATIONS`` one figure is produced with
    one box-plot panel per (graph, confidence cutoff) combination. Every
    panel shows four AUC distributions:

    * ``rr``    - default evaluation, unweighted gat2vec input
    * ``bsvm``  - ``evaluation='svm'`` with balanced class weights, unweighted
    * ``wrr``   - default evaluation, input written with association scores
    * ``wbsvm`` - balanced SVM, input written with association scores

    The figure is saved as ``comparison_humanbase-<disease>.png`` and closed
    afterwards so figures do not pile up across iterations.
    """
    # Symmetric fold-change window; does not depend on the loop variables.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, -lfc_cutoff
    svm_kwargs = {'evaluation': 'svm', 'class_weights': 'balanced'}

    for disease in DISEASE_ABBREVIATIONS:
        targets_file = targets_file_mask % disease
        assoc_file = assoc_file_mask % disease
        g2v_path = g2v_path_mask % disease
        dge_path = dge_path_mask % disease
        graph_paths = graphs_for_diseases[disease]

        gene_list = parse_dge(dge_path=dge_path,
                              entrez_id_header=entrez_id_name,
                              log2_fold_change_header=log_fold_change_name,
                              adj_p_header=adjusted_p_value_name,
                              entrez_delimiter=split_char,
                              base_mean_header=base_mean_name,
                              csv_separator=';')

        # One panel is drawn per (graph, cutoff) pair, so the grid must be
        # sized by the number of pairs rather than the number of graphs.
        combos = list(itertools.product(graph_paths, confidence_cutoffs))
        fig, axs = plt.subplots(ncols=len(combos),
                                sharey='all',
                                squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {disease}')

        # Scores depend only on ``assoc_file``; parse them once per figure.
        # A falsy ``assoc_file`` is passed through unchanged.
        assoc_score = assoc_file and parse_association_scores(assoc_file)

        # NOTE(review): the frame is shared by all panels of a figure, so
        # later assignments are aligned on the index set by the first one.
        df = pd.DataFrame()

        for axs_ind, (ppi_graph_path,
                      ppi_edge_min_confidence) in enumerate(combos):
            # squeeze=False makes ``axs`` 2-D; select the single panel.
            ax = axs[0][axs_ind]

            network = generate_ppi_network(
                ppi_graph_path=os.path.join(base_ppi_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)

            logger.debug(f'types for {disease}')
            # First pass writes input without weights, second pass with the
            # association scores; both are evaluated the same two ways.
            for prefix, write_kwargs in (('', {}),
                                         ('w', {'assoc_score': assoc_score})):
                write_gat2vec_input_files(network=network,
                                          targets=targets,
                                          home_dir=g2v_path,
                                          **write_kwargs)
                for suffix, rank_kwargs in (('rr', {}), ('bsvm', svm_kwargs)):
                    column = prefix + suffix
                    auc_df, _ = rank_targets(directory=g2v_path,
                                             network=network,
                                             **rank_kwargs)
                    df[column] = auc_df['auc']
                    logger.debug(type(df[column]))
                    logger.debug(type(df))

            # Draw into this combination's own panel, not the axes array.
            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=ax)
            ax.set_title(
                f'PPI {ppi_graph_path}, cutoff {ppi_edge_min_confidence}')

        fig.savefig(f'comparison_humanbase-{disease}.png')
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def optimize_g2v_parameters(
        ppi_graph_path: str,
        dataset: str,
        evaluation: str = 'cv',
        assoc_path: str = None
) -> pd.DataFrame:
    """Sweep gat2vec hyper-parameters on one PPI network and gather metrics.

    The network and the gat2vec input files are built once; ``rank_targets``
    is then run for every value of four one-dimensional sweeps (embedding
    dimension, number of walks, walk length, window size). While one
    parameter is swept the dimension is held at 256 and the remaining
    parameters are left at the defaults of ``rank_targets``.

    :param ppi_graph_path: path of the PPI graph handed to
        ``generate_ppi_network``.
    :param dataset: dataset name; selects the DGE file, the DGE column
        headers and the targets file.
    :param evaluation: evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: optional path to association scores; when falsy no
        scores are parsed and the falsy value itself is forwarded.
    :return: the per-run frames produced by ``assemble_results_df``,
        concatenated in sweep order with a fresh integer index.
    """
    # AD datasets use their own set of DGE column headers.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis
    gene_list = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=dge_params['id'],
        log2_fold_change_header=dge_params['l2f'],
        adj_p_header=dge_params['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=dge_params['mean'],
        csv_separator=';'
    )
    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas_path,
    )

    logger.info(f'Nodes {len(network.graph.vs)}')
    targets = parse_gene_list(targets_file(dataset_to_disease_abv(dataset)), network)
    assoc_score = assoc_path and parse_association_scores(assoc_path)
    logger.debug(f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
        assoc_score=assoc_score
    )

    # (label stored in the results, rank_targets keyword, values to try).
    # For reference, the defaults noted by the original author were:
    # num_walks=10, walk_length=80, dimension=128, window_size=5.
    sweeps = (
        ('Dimension', 'dimension', [32, 64, 128, 256]),
        ('Num Walks', 'num_walks', [6, 10, 20, 40, 80]),
        ('Walk Length', 'walk_length', [20, 40, 80, 120, 160]),
        ('Window Size', 'window_size', [3, 5, 7, 10, 20, 40]),
    )

    # Frames are collected in a list and concatenated once:
    # DataFrame.append no longer exists in pandas >= 2.0.
    # NOTE(review): assumes assemble_results_df returns a DataFrame.
    frames = []
    for label, keyword, values in sweeps:
        for value in values:
            # Dimension is pinned to 256 unless it is the swept parameter,
            # in which case the swept value overrides the pin.
            rank_kwargs = {'dimension': 256, keyword: value}
            metrics_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
                evaluation=evaluation,
                **rank_kwargs
            )
            frames.append(
                assemble_results_df(metrics_df, dataset, label, value)
            )

    results = pd.concat(frames, ignore_index=True)
    logger.debug('results')
    logger.debug(results)
    return results