Beispiel #1
0
def main():
    """Run cross-validated link prediction for every DGE dataset.

    The heterogeneous network is generated once. For each combination of the
    ``use_dge`` flag and a dataset from ``AD_DGE_DATASETS`` and
    ``NON_AD_DGE_DATASETS``, a deep copy of the network is set up with the
    parsed gene list and handed to ``predict_links_cv``. All returned metric
    frames are combined and written to ``results_df.tsv`` in ``g2v_path``.

    A ``ValueError`` raised while processing a dataset is logged and that
    dataset is skipped.
    """
    # natural order: disease <-> target <-> chem
    # disease - chem is what is desired
    # disease - target is what is desired
    # http://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz

    metric_frames = []
    h_network = generate_heterogeneous_network(ppi_path, dg_path, dch_path,
                                               chg_file)

    # The gat2vec hyper-parameters do not change between runs.
    num_walks = gat2vec_config.num_walks
    walk_length = gat2vec_config.walk_length
    dimension = gat2vec_config.dimension
    window_size = gat2vec_config.window_size

    for use_dge, dataset in itt.product([False, True],
                                        AD_DGE_DATASETS + NON_AD_DGE_DATASETS):
        disease_abv = dataset_to_disease_abv(dataset)
        dge_params = dge_params_ad if disease_abv == 'ad' else dge_params_dis
        do_id = disease_identifiers[disease_abv]
        logger.debug(
            f'Running for disease {disease_abv}, with the dataset {dataset}, '
            f'using the id {do_id}')
        try:
            gene_list = parse_dge(
                dge_path=dge_file(dataset),
                entrez_id_header=dge_params['id'],
                log2_fold_change_header=dge_params['l2f'],
                adj_p_header=dge_params['adjp'],
                entrez_delimiter=split_char,
                base_mean_header=dge_params['mean'],
            )

            # Work on a copy so the shared base network stays untouched.
            h_network_ds = deepcopy(h_network)
            h_network_ds.set_up_network(genes=gene_list)

            metrics_df = predict_links_cv(h_network_ds,
                                          use_dge=use_dge,
                                          dataset=dataset,
                                          num_walks=num_walks,
                                          walk_length=walk_length,
                                          dimension=dimension,
                                          window_size=window_size)
            metric_frames.append(metrics_df)
        except ValueError:
            logger.error(
                f'Dataset {dataset} ({do_id}) not found in the graph.')

    # DataFrame.append was removed in pandas 2.0; concatenating once is also
    # cheaper than growing the frame row block by row block.
    # NOTE(review): assumes predict_links_cv returns a DataFrame - confirm.
    if metric_frames:
        results = pd.concat(metric_frames, ignore_index=True)
    else:
        results = pd.DataFrame()
    results.to_csv(os.path.join(g2v_path, 'results_df.tsv'), sep='\t')

    print("done")
Beispiel #2
0
def get_ppi_results(ppi_graph_path: str,
                    dataset: str,
                    evaluation: str = 'cv',
                    assoc_path: str = None,
                    phewas: str = None) -> pd.DataFrame:
    """Rank targets on a PPI network built for ``dataset``.

    :param ppi_graph_path: path passed to ``generate_ppi_network``.
    :param dataset: DGE dataset name; selects the DGE file, the column
        headers and the targets file.
    :param evaluation: evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: optional association-score file; when falsy the value
        is forwarded unchanged instead of being parsed.
    :param phewas: forwarded as ``disease_associations_path``.
    :return: frame holding the ``auc`` and ``aps`` columns of the ranking
        metrics plus ``eval`` (the evaluation mode) and ``dge`` (the dataset).
    """
    # AD datasets use a different set of column headers than the others.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    genes = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=headers['id'],
        log2_fold_change_header=headers['l2f'],
        adj_p_header=headers['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=headers['mean'],
        csv_separator=';',
    )
    ppi_network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=genes,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas,
    )
    logger.info(f'Nodes {len(ppi_network.graph.vs)}')

    disease = dataset_to_disease_abv(dataset)
    known_targets = parse_gene_list(targets_file(disease), ppi_network)
    if assoc_path:
        scores = parse_association_scores(assoc_path)
    else:
        scores = assoc_path
    logger.debug(
        f'Number of targets being used for the network: {len(known_targets)}')

    write_gat2vec_input_files(
        network=ppi_network,
        targets=known_targets,
        home_dir=g2v_path,
        assoc_score=scores,
    )
    # Only the metrics are needed; the second return value is discarded.
    metrics, _ = rank_targets(directory=g2v_path,
                              network=ppi_network,
                              evaluation=evaluation)

    summary = pd.DataFrame()
    for column in ('auc', 'aps'):
        summary[column] = metrics[column]
    summary['eval'] = evaluation
    summary['dge'] = dataset
    return summary
Beispiel #3
0
def get_bel_results(dataset: str,
                    num_walks: int = 30,
                    walk_length: int = 4,
                    dimension: int = 256,
                    window_size: int = 10) -> pd.DataFrame:
    """Rank targets on the BEL network built for ``dataset``.

    :param dataset: DGE dataset name; selects the DGE file, the column
        headers and the disease-specific ``ot_symbol.txt`` targets file.
    :param num_walks: forwarded to ``rank_targets``.
    :param walk_length: forwarded to ``rank_targets``.
    :param dimension: forwarded to ``rank_targets``.
    :param window_size: forwarded to ``rank_targets``.
    :return: a single frame holding the ``auc`` and ``aps`` columns of the
        ranking metrics plus ``ds`` (always ``'bel'``) and ``dge`` (the
        dataset name).
    """
    # AD datasets use a different set of column headers than the others.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis
    gene_list = parse_dge(dge_path=dge_file(dataset),
                          entrez_id_header=dge_params['id'],
                          log2_fold_change_header=dge_params['l2f'],
                          adj_p_header=dge_params['adjp'],
                          entrez_delimiter=split_char,
                          base_mean_header=dge_params['mean'],
                          csv_separator=';')
    network = generate_bel_network(
        bel_graph_path=bel_files,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
    )
    logger.info(f'Nodes {len(network.graph.nodes)}')
    targets = parse_gene_list(
        os.path.join(targets_base_path, dataset_to_disease_abv(dataset),
                     'ot_symbol.txt'), network)
    logger.debug(
        f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
    )
    # Only the metrics are needed; the second return value is discarded.
    metrics_df, _ = rank_targets(directory=g2v_path,
                                 network=network,
                                 num_walks=num_walks,
                                 walk_length=walk_length,
                                 dimension=dimension,
                                 window_size=window_size)
    df = pd.DataFrame()
    df['auc'] = metrics_df['auc']
    df['aps'] = metrics_df['aps']
    df['ds'] = 'bel'
    df['dge'] = dataset
    return df
Beispiel #4
0
def get_ppi_results(ppi_graph_path: str, dataset: str) -> pd.DataFrame:
    """Rank targets on a PPI network built for ``dataset``.

    :param ppi_graph_path: path passed to ``generate_ppi_network``; its base
        name is also the key used to look up the network alias.
    :param dataset: DGE dataset name; selects the DGE file, the column
        headers and the targets file.
    :return: frame holding the ``auc`` and ``aps`` columns of the ranking
        metrics plus ``ds`` (the network alias) and ``dge`` (the dataset).
    """
    # AD datasets use a different set of column headers than the others.
    if dataset in AD_DGE_DATASETS:
        headers = dge_params_ad
    else:
        headers = dge_params_dis

    genes = parse_dge(dge_path=dge_file(dataset),
                      entrez_id_header=headers['id'],
                      log2_fold_change_header=headers['l2f'],
                      adj_p_header=headers['adjp'],
                      entrez_delimiter=split_char,
                      base_mean_header=headers['mean'])
    ppi_network = generate_ppi_network(ppi_graph_path=ppi_graph_path,
                                       dge_list=genes,
                                       max_adj_p=max_padj,
                                       max_log2_fold_change=max_log2_fold_change,
                                       min_log2_fold_change=min_log2_fold_change,
                                       ppi_edge_min_confidence=ppi_edge_min_confidence,
                                       current_disease_ids_path='')

    graph = ppi_network.graph
    logger.info(f'Nodes {len(graph.vs)}')
    logger.info(f'Edges {len(graph.es)}')

    disease = dataset_to_disease_abv(dataset)
    known_targets = parse_gene_list(targets_file(disease), ppi_network)
    logger.debug(
        f'Number of targets being used for the network: {len(known_targets)}')

    write_gat2vec_input_files(network=ppi_network,
                              targets=known_targets,
                              home_dir=g2v_path)
    # Only the metrics are needed; the second return value is discarded.
    metrics, _ = rank_targets(directory=g2v_path, network=ppi_network)

    summary = pd.DataFrame()
    for column in ('auc', 'aps'):
        summary[column] = metrics[column]
    summary['ds'] = network_alias[os.path.basename(ppi_graph_path)]
    summary['dge'] = dataset
    return summary
Beispiel #5
0
def main():
    """Compare target-ranking AUCs across PPI graphs for each DGE dataset.

    For each of the DGE datasets ``BM10``, ``BM22``, ``BM36`` and ``BM44`` one
    figure is created with one boxplot panel per entry of ``graph_paths``.
    Each panel shows the ``auc`` column returned by ``rank_targets`` for four
    set-ups: the default evaluation (``rr``) and the balanced SVM (``bsvm``)
    on input files written without association scores, and the same two
    (``wrr``, ``wbsvm``) on input files written with association scores.
    The figure is saved as ``comparison_humanbase(<dge>).png``.
    """
    # Loop-invariant settings: compute them once instead of per graph.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, lfc_cutoff * -1
    assoc_score = (parse_association_scores(assoc_file)
                   if assoc_file else assoc_file)

    for dge in ['BM10', 'BM22', 'BM36', 'BM44']:

        dge_path = dge_base_path % dge

        gene_list = parse_dge(dge_path=dge_path,
                              entrez_id_header=entrez_id_name,
                              log2_fold_change_header=log_fold_change_name,
                              adj_p_header=adjusted_p_value_name,
                              entrez_delimiter=split_char,
                              base_mean_header=base_mean_name,
                              csv_separator=';')

        dim = len(graph_paths)
        # squeeze=False keeps axs two-dimensional even for a single graph.
        fig, axs = plt.subplots(ncols=dim, sharey='all', squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {dge}')

        for axs_ind, ppi_graph_path in enumerate(graph_paths):
            network = generate_ppi_network(
                ppi_graph_path=os.path.join(ppi_base_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)

            # Fresh frame per graph: reusing one frame would align later
            # columns on the index left behind by the previous graph.
            df = pd.DataFrame()

            # File with no weights
            write_gat2vec_input_files(network=network,
                                      targets=targets,
                                      home_dir=g2v_path)

            auc_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
            )
            df['rr'] = auc_df['auc']

            auc_df, _ = rank_targets(directory=g2v_path,
                                     network=network,
                                     evaluation='svm',
                                     class_weights='balanced')
            df['bsvm'] = auc_df['auc']

            # File with weights
            write_gat2vec_input_files(network=network,
                                      targets=targets,
                                      home_dir=g2v_path,
                                      assoc_score=assoc_score)

            auc_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
            )
            df['wrr'] = auc_df['auc']

            auc_df, _ = rank_targets(directory=g2v_path,
                                     network=network,
                                     evaluation='svm',
                                     class_weights='balanced')
            df['wbsvm'] = auc_df['auc']

            panel = axs[0][axs_ind]
            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=panel)
            panel.set_title(f'PPI {ppi_graph_path}')

        fig.savefig(f'comparison_humanbase({dge}).png')
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
Beispiel #6
0
def main():
    """Compare target-ranking AUCs per disease, PPI graph and confidence cutoff.

    For each disease in ``DISEASE_ABBREVIATIONS`` one figure is created with
    one boxplot panel per combination of a graph from
    ``graphs_for_diseases[disease]`` and a value from ``confidence_cutoffs``.
    Each panel shows the ``auc`` column returned by ``rank_targets`` for four
    set-ups: the default evaluation (``rr``) and the balanced SVM (``bsvm``)
    on input files written without association scores, and the same two
    (``wrr``, ``wbsvm``) on input files written with association scores.
    The figure is saved as ``comparison_humanbase-<disease>.png``.
    """
    # Loop-invariant cutoffs, computed once.
    max_log2_fold_change, min_log2_fold_change = lfc_cutoff, lfc_cutoff * -1

    for disease in DISEASE_ABBREVIATIONS:
        targets_file = targets_file_mask % disease
        assoc_file = assoc_file_mask % disease
        g2v_path = g2v_path_mask % disease
        dge_path = dge_path_mask % disease
        graph_paths = graphs_for_diseases[disease]

        gene_list = parse_dge(dge_path=dge_path,
                              entrez_id_header=entrez_id_name,
                              log2_fold_change_header=log_fold_change_name,
                              adj_p_header=adjusted_p_value_name,
                              entrez_delimiter=split_char,
                              base_mean_header=base_mean_name,
                              csv_separator=';')

        # The association scores only depend on the disease.
        assoc_score = (parse_association_scores(assoc_file)
                       if assoc_file else assoc_file)

        # One panel per (graph, cutoff) pair; sizing the grid by the number
        # of graphs alone overruns the axes array with several cutoffs.
        combinations = list(itertools.product(graph_paths, confidence_cutoffs))
        dim = len(combinations)
        # squeeze=False keeps axs two-dimensional even for a single panel.
        fig, axs = plt.subplots(ncols=dim, sharey='all', squeeze=False)
        fig.set_size_inches(10, 5)
        fig.suptitle(f'DGE {disease}')

        for axs_ind, (ppi_graph_path,
                      ppi_edge_min_confidence) in enumerate(combinations):
            network = generate_ppi_network(
                ppi_graph_path=os.path.join(base_ppi_path, ppi_graph_path),
                dge_list=gene_list,
                max_adj_p=max_padj,
                max_log2_fold_change=max_log2_fold_change,
                min_log2_fold_change=min_log2_fold_change,
                ppi_edge_min_confidence=ppi_edge_min_confidence,
                current_disease_ids_path='',
                disease_associations_path=phewas_path,
            )

            targets = parse_gene_list(targets_file, network.graph)

            # Fresh frame per panel: reusing one frame would align later
            # columns on the index left behind by the previous network.
            df = pd.DataFrame()

            # File with no weights
            write_gat2vec_input_files(network=network,
                                      targets=targets,
                                      home_dir=g2v_path)

            auc_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
            )
            df['rr'] = auc_df['auc']
            logger.debug(f'types for {disease}')
            logger.debug(type(df['rr']))
            logger.debug(type(df))

            auc_df, _ = rank_targets(directory=g2v_path,
                                     network=network,
                                     evaluation='svm',
                                     class_weights='balanced')
            df['bsvm'] = auc_df['auc']
            logger.debug(type(df['bsvm']))
            logger.debug(type(df))

            # File with weights
            write_gat2vec_input_files(network=network,
                                      targets=targets,
                                      home_dir=g2v_path,
                                      assoc_score=assoc_score)

            auc_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
            )
            df['wrr'] = auc_df['auc']
            logger.debug(type(df['wrr']))
            logger.debug(type(df))

            auc_df, _ = rank_targets(directory=g2v_path,
                                     network=network,
                                     evaluation='svm',
                                     class_weights='balanced')
            df['wbsvm'] = auc_df['auc']
            logger.debug(type(df['wbsvm']))
            logger.debug(type(df))

            # Draw on this combination's own panel, not the whole axes array.
            panel = axs[0][axs_ind]
            df.boxplot(column=['rr', 'wrr', 'bsvm', 'wbsvm'], ax=panel)
            panel.set_title(
                f'PPI {ppi_graph_path}, cutoff {ppi_edge_min_confidence}')

        fig.savefig(f'comparison_humanbase-{disease}.png')
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
def main():
    """Run repeated link prediction with and without DGE data and plot it.

    The heterogeneous network is generated once. For each combination of the
    ``use_dge`` flag and a selected dataset, a deep copy of the network is set
    up with the parsed gene list, the gat2vec input files are written and
    ``predict_links`` is run ten times. Trial index, AUC, APS, dataset and the
    ``use_dge`` flag are collected and written to ``results_df.tsv`` in
    ``g2v_path``. Finally one boxplot per ``DGE_DATASETS`` group and metric is
    saved as ``comp7_<group>_<metric>.png``.

    A ``ValueError`` raised while processing a dataset is logged and that
    dataset is skipped.
    """
    num_trials = 10
    results_dict = defaultdict(list)
    h_network1 = generate_heterogeneous_network(ppi_path, dg_path, dch_path,
                                                chg_file)

    # AD_DGE_DATASETS[2] is a single entry; wrap a bare dataset name in a
    # list so it can be combined with the non-AD datasets (str + list raises
    # TypeError). An entry that already is a sequence is used unchanged.
    ad_selection = AD_DGE_DATASETS[2]
    if isinstance(ad_selection, str):
        ad_selection = [ad_selection]
    selected_datasets = list(ad_selection) + list(NON_AD_DGE_DATASETS)

    for use_dge, dataset in itt.product([False, True], selected_datasets):
        disease_abv = dataset_to_disease_abv(dataset)
        do_id = disease_identifiers[disease_abv]
        dge_params = dge_params_ad if disease_abv == 'ad' else dge_params_dis

        logger.debug(
            f'Running for disease {disease_abv}, with the dataset {dataset}, using the id {do_id}'
        )
        try:
            gene_list = parse_dge(dge_path=dge_file(dataset),
                                  entrez_id_header=dge_params['id'],
                                  log2_fold_change_header=dge_params['l2f'],
                                  adj_p_header=dge_params['adjp'],
                                  entrez_delimiter=split_char,
                                  base_mean_header=dge_params['mean'],
                                  csv_separator=';')

            # Work on a copy so the shared base network stays untouched.
            h_network = deepcopy(h_network1)
            h_network.set_up_network(genes=gene_list)
            h_network.write_gat2vec_input_files(
                home_dir=g2v_path,
                disease_id=do_id,
                use_dge_data=use_dge,
                filter_pleiotropic_targets=True)
            for i in range(num_trials):
                auc, aps = predict_links(
                    g2v_path,
                    gat2vec_config.num_walks,
                    gat2vec_config.walk_length,
                    gat2vec_config.dimension,
                    gat2vec_config.window_size,
                )
                logger.debug(f'tr: {i}\t{round(auc, 3)}\t{round(aps, 3)}')
                results_dict['tr'].append(i)
                results_dict['auc'].append(auc)
                results_dict['aps'].append(aps)
                results_dict['dge'].append(dataset)
                results_dict['eval'].append(use_dge)
        except ValueError:
            logger.error(
                f'Dataset {dataset} ({do_id}) not found in the graph.')
    results = pd.DataFrame(results_dict)
    results.to_csv(os.path.join(g2v_path, 'results_df.tsv'), sep='\t')

    if results.empty:
        # Without any rows the frame has no 'dge' column to filter on.
        logger.warning('No link prediction results collected; skipping plots.')
    else:
        for key, datasets in DGE_DATASETS.items():
            group_results = results[results['dge'].isin(datasets)]
            for metric in ['auc', 'aps']:
                # Plot the metric named in the file, not always the AUC.
                fig = sns.boxplot(data=group_results,
                                  x='dge',
                                  y=metric,
                                  hue='eval').get_figure()
                fig.suptitle('Link Prediction.')
                fig.savefig(f'comp7_{key}_{metric}.png')
                plt.close(fig)

    print("done")
def optimize_g2v_parameters(
        ppi_graph_path: str,
        dataset: str,
        evaluation: str = 'cv',
        assoc_path: str = None
) -> pd.DataFrame:
    """Sweep the gat2vec hyper-parameters for target ranking on one dataset.

    A PPI network is built for ``dataset`` and the gat2vec input files are
    written once. ``rank_targets`` is then run for every value of four sweeps
    (dimension, number of walks, walk length, window size). The dimension
    sweep varies ``dimension`` itself; the other three fix ``dimension`` at
    256 and vary one parameter each.

    :param ppi_graph_path: path passed to ``generate_ppi_network``.
    :param dataset: DGE dataset name; selects the DGE file, the column
        headers and the targets file.
    :param evaluation: evaluation mode forwarded to ``rank_targets``.
    :param assoc_path: optional association-score file; when falsy the value
        is forwarded unchanged instead of being parsed.
    :return: the frames produced by ``assemble_results_df`` for every run,
        concatenated with a fresh index.
    """
    # AD datasets use a different set of column headers than the others.
    dge_params = dge_params_ad if dataset in AD_DGE_DATASETS else dge_params_dis
    gene_list = parse_dge(
        dge_path=dge_file(dataset),
        entrez_id_header=dge_params['id'],
        log2_fold_change_header=dge_params['l2f'],
        adj_p_header=dge_params['adjp'],
        entrez_delimiter=split_char,
        base_mean_header=dge_params['mean'],
        csv_separator=';'
    )
    network = generate_ppi_network(
        ppi_graph_path=ppi_graph_path,
        dge_list=gene_list,
        max_adj_p=max_padj,
        max_log2_fold_change=max_log2_fold_change,
        min_log2_fold_change=min_log2_fold_change,
        ppi_edge_min_confidence=ppi_edge_min_confidence,
        current_disease_ids_path='',
        disease_associations_path=phewas_path,
    )

    logger.info(f'Nodes {len(network.graph.vs)}')
    targets = parse_gene_list(targets_file(dataset_to_disease_abv(dataset)), network)
    assoc_score = parse_association_scores(assoc_path) if assoc_path else assoc_path
    logger.debug(f'Number of targets being used for the network: {len(targets)}')

    write_gat2vec_input_files(
        network=network,
        targets=targets,
        home_dir=g2v_path,
        assoc_score=assoc_score
    )

    # NOTE(review): the original carried the values num_walks=10,
    # walk_length=80, dimension=128, window_size=5 as an inline note -
    # presumably the defaults of rank_targets; confirm.
    # (result label, rank_targets keyword, values to try), in run order.
    sweeps = (
        ('Dimension', 'dimension', [32, 64, 128, 256]),
        ('Num Walks', 'num_walks', [6, 10, 20, 40, 80]),
        ('Walk Length', 'walk_length', [20, 40, 80, 120, 160]),
        ('Window Size', 'window_size', [3, 5, 7, 10, 20, 40]),
    )

    frames = []
    for label, keyword, values in sweeps:
        for value in values:
            # Every sweep except the dimension sweep runs with dimension=256;
            # the swept keyword overrides that entry for the dimension sweep.
            hyper_params = {'dimension': 256, keyword: value}
            metrics_df, _ = rank_targets(
                directory=g2v_path,
                network=network,
                evaluation=evaluation,
                **hyper_params
            )
            frames.append(
                assemble_results_df(metrics_df, dataset, label, value))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # NOTE(review): assumes assemble_results_df returns a DataFrame - confirm.
    results = pd.concat(frames, ignore_index=True)
    logger.debug('results')
    logger.debug(results)
    return results
def main():
    """Sweep the gat2vec hyper-parameters for link prediction on each dataset.

    The heterogeneous network is generated once. For every dataset a deep
    copy is set up with the parsed gene list and the gat2vec input files are
    written. Four sweeps follow (number of walks, walk length, window size,
    dimension); each varies one parameter while the other three take their
    values from ``gat2vec_config``. The results gathered so far are written
    to ``results_df_nonad.tsv`` in ``g2v_path`` after every dataset.

    A ``ValueError`` raised while processing a dataset is logged and that
    dataset is skipped.
    """
    # natural order: disease <-> target <-> chem
    # disease - chem is what is desired
    # disease - target is what is desired
    # http://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz

    collected = defaultdict(list)
    base_network = generate_heterogeneous_network(ppi_path, dg_path, dch_path,
                                                  chg_file)

    # (result label, hyper-parameter name, values to try), in run order.
    sweeps = (
        ('nw', 'num_walks', [6, 10, 20, 40, 80]),
        ('wl', 'walk_length', [20, 40, 80, 120, 160]),
        ('ws', 'window_size', [3, 5, 7, 10, 20, 40]),
        ('d', 'dimension', [32, 64, 128, 256]),
    )

    # The flag from the product is not used inside the loop body.
    for _use_dge, dataset in itt.product([True],
                                         AD_DGE_DATASETS + NON_AD_DGE_DATASETS):
        disease_abv = dataset_to_disease_abv(dataset)
        do_id = disease_identifiers[disease_abv]
        if disease_abv == 'ad':
            headers = dge_params_ad
        else:
            headers = dge_params_dis
        logger.debug(
            f'Running for disease {disease_abv}, with the dataset {dataset}, using the id {do_id}'
        )
        try:
            genes = parse_dge(dge_path=dge_file(dataset),
                              entrez_id_header=headers['id'],
                              log2_fold_change_header=headers['l2f'],
                              adj_p_header=headers['adjp'],
                              entrez_delimiter=split_char,
                              base_mean_header=headers['mean'])

            # Work on a copy so the shared base network stays untouched.
            network = deepcopy(base_network)
            network.set_up_network(genes=genes)
            network.write_gat2vec_input_files(home_dir=g2v_path,
                                              disease_id=do_id,
                                              filter_pleiotropic_targets=True)

            # Configured values used for every parameter not being swept.
            configured = {
                'num_walks': gat2vec_config.num_walks,
                'walk_length': gat2vec_config.walk_length,
                'dimension': gat2vec_config.dimension,
                'window_size': gat2vec_config.window_size,
            }

            for param, name, candidates in sweeps:
                for value in candidates:
                    settings = dict(configured)
                    settings[name] = value

                    start = time()
                    lp_results = mp_predict_links(settings['num_walks'],
                                                  settings['walk_length'],
                                                  settings['dimension'],
                                                  settings['window_size'])

                    extract_results(collected, lp_results, dataset, param,
                                    value)
                    logger.info(
                        f'Runtime for {name} = {value}: {time() - start}s')
        except ValueError:
            logger.error(
                f'Dataset {dataset} ({do_id}) not found in the graph.')

        # backup intermediate results in each iteration.
        results = pd.DataFrame(collected)
        results.to_csv(os.path.join(g2v_path, 'results_df_nonad.tsv'),
                       sep='\t')

    print("done")