Ejemplo n.º 1
0
def test_topology_total_degree(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None):
    """
    Performs the analysis of the total degree of the geneset nodes.

    It computes a p-value for the ratio of total degree of the geneset being bigger than the one expected by chance
    for a geneset of the same size.

    Genesets with fewer than ``size_cut`` genes, or fewer than ``size_cut``
    mapped nodes, are left out of the results table.

    :param network_file: network file, read with ``rc.ReadTsv``
    :param geneset_file: GMT geneset file, read with ``rc.ReadGmt``
    :param output_table: output results table
    :param setname: geneset to analyse, passed to ``get_geneset``
    :param size_cut: minimum geneset size (raw and mapped) to be reported
    :param number_of_permutations: passed as ``max_iter`` to the empirical test
    :param cores: number of cores passed to the empirical test
    :param n_bins: passed as ``degree_bins`` to ``st.StatisticalTest``
    :param results_figure: if set, a results barplot is painted to this path
    :param diagnostic_null_folder: if set, one null-distribution plot per
        geneset is saved; the value is used as a plain string prefix, so it
        has to end with a path separator
    """
    logging.info("Evaluating the test topology total degree, please wait")
    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    setnames = list(geneset)

    # Generate output
    output1 = out.Output(network_file, output_table, "topology_total_degree",
                         geneset_file, setnames)
    logging.info("Results file = %s", output1.output_table_results)

    # Create table
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_total_degree_statistic,
                                 network,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        genes = set(genes)
        # Only genesets strictly smaller than size_cut are discarded, which
        # matches the mapped-size check applied after the test.
        if len(genes) < size_cut:
            continue

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:%s", name)
        if n_mapped < size_cut:
            logging.info("%s removed from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        null_mean = np.mean(null_d)
        null_var = np.var(null_d)
        logging.info("Observed: %g p-value: %g", observed, pvalue)
        # printf-style placeholders need %-formatting; str.format left them
        # unexpanded in the log.
        logging.info("Null mean: %g null variance: %g", null_mean, null_var)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          null_mean, null_var)
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name +
                '_total_degree_null_distribution.pdf',
                setname=name)

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
    logging.info("Test topology total degree completed")
Ejemplo n.º 2
0
def test_topology_sp(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    distance_matrix_filename: "distance hdf5 matrix file generated by pygna",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Performs geneset network topology shortest path analysis.

    It computes a p-value for the average shortest path length
    of the geneset being smaller than expected by chance
    for a geneset of the same size.

    The network is restricted to its largest connected component. Genesets
    with fewer than ``size_cut`` genes, or fewer than ``size_cut`` mapped
    nodes, are left out of the results table.

    :param network_file: network file, read with ``rc.ReadTsv``
    :param geneset_file: GMT geneset file, read with ``rc.ReadGmt``
    :param distance_matrix_filename: distance matrix file, read with
        ``read_distance_matrix``
    :param output_table: output results table
    :param setname: geneset to analyse, passed to ``get_geneset``
    :param size_cut: minimum geneset size (raw and mapped) to be reported
    :param number_of_permutations: passed as ``max_iter`` to the empirical test
    :param cores: number of cores passed to the empirical test
    :param in_memory: forwarded to ``read_distance_matrix``
    :param n_bins: passed as ``degree_bins`` to ``st.StatisticalTest``
    :param results_figure: if set, a results barplot is painted to this path
    :param diagnostic_null_folder: if set, one null-distribution plot per
        geneset is saved; the value is used as a plain string prefix, so it
        has to end with a path separator
    """

    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))

    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    # Load the matrix file once and take both parts from the same result.
    loaded = read_distance_matrix(distance_matrix_filename,
                                  in_memory=in_memory)
    diz = {"nodes": loaded[0], "matrix": loaded[1]}
    # NOTE(review): adding the transpose presumably symmetrises a triangular
    # distance matrix -- confirm against the matrix builder.
    diz["matrix"] = diz["matrix"] + np.transpose(diz["matrix"])
    # Self-distances are set to infinity.
    np.fill_diagonal(diz["matrix"], float("inf"))
    setnames = list(geneset)

    output1 = out.Output(network_file, output_table, "topology_sp",
                         geneset_file, setnames)
    logging.info("Results file = %s", output1.output_table_results)
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_localisation_statistic,
                                 network,
                                 diz,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        genes = set(genes)
        # Only genesets strictly smaller than size_cut are discarded.
        if len(genes) < size_cut:
            logging.info("%s remove from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        logging.info("Setname:%s", name)
        # NOTE(review): no ``alternative`` is passed, so the test relies on
        # the default of ``empirical_pvalue`` -- confirm it is "less".
        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes, cores=cores, max_iter=number_of_permutations)

        # Same mapped-size filter as the other topology tests.
        if n_mapped < size_cut:
            logging.info("%s remove from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations, observed,
                                          pvalue, np.mean(null_d),
                                          np.var(null_d))
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name + '_sp_null_distribution.pdf',
                setname=name,
                alternative="less")

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='less')
Ejemplo n.º 3
0
def test_topology_rwr(
    network_file: "network file, use a network with weights",
    geneset_file: "GMT geneset file",
    rwr_matrix_filename: "hdf5 RWR matrix obtained with pygna ",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Performs the analysis of random walk probabilities.

    Given the RWR matrix, it compares the probability of walking between the genes in the geneset compared to
    those of walking between the nodes of a geneset with the same size.

    The network is restricted to its largest connected component. Genesets
    with fewer than ``size_cut`` genes, or fewer than ``size_cut`` mapped
    nodes, are left out of the results table.

    :param network_file: network file, read with ``rc.ReadTsv``
    :param geneset_file: GMT geneset file, read with ``rc.ReadGmt``
    :param rwr_matrix_filename: RWR matrix file, read with
        ``read_distance_matrix``
    :param output_table: output results table
    :param setname: geneset to analyse, passed to ``get_geneset``
    :param size_cut: minimum geneset size (raw and mapped) to be reported
    :param number_of_permutations: passed as ``max_iter`` to the empirical test
    :param cores: number of cores passed to the empirical test
    :param in_memory: forwarded to ``read_distance_matrix``
    :param n_bins: passed as ``degree_bins`` to ``st.StatisticalTest``
    :param results_figure: if set, a results barplot is painted to this path
    :param diagnostic_null_folder: if set, one null-distribution plot per
        geneset is saved; the value is used as a plain string prefix, so it
        has to end with a path separator
    """

    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    # Load the matrix file once and take both parts from the same result.
    loaded = read_distance_matrix(rwr_matrix_filename, in_memory=in_memory)
    rw_dict = {"nodes": loaded[0], "matrix": loaded[1]}

    setnames = list(geneset)
    output1 = out.Output(network_file, output_table, "topology_rwr",
                         geneset_file, setnames)

    logging.info("Results file = %s", output1.output_table_results)
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_RW_statistic,
                                 network,
                                 rw_dict,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        genes = set(genes)
        # Only genesets strictly smaller than size_cut are discarded, which
        # matches the mapped-size check applied after the test.
        if len(genes) < size_cut:
            logging.info("%s removed from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:%s", name)
        if n_mapped < size_cut:
            logging.info("%s remove from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name + '_rwr_null_distribution.pdf',
                setname=name)
        # saving output
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          np.mean(null_d),
                                          np.var(null_d))

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
Ejemplo n.º 4
0
def test_topology_module(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    output_lcc:
    "for creating a GMT file with the LCC lists pass a GMT filename" = None,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Performs geneset network topology module analysis.

    It computes a p-value for the largest connected component of the geneset being bigger than the one expected by chance
    for a geneset of the same size.

    Genesets with fewer than ``size_cut`` genes, or fewer than ``size_cut``
    mapped nodes, are left out of the results table.

    :param network_file: network file, read with ``rc.ReadTsv``
    :param geneset_file: GMT geneset file, read with ``rc.ReadGmt``
    :param output_table: output results table
    :param setname: geneset to analyse, passed to ``get_geneset``
    :param size_cut: minimum geneset size (raw and mapped) to be reported
    :param number_of_permutations: passed as ``max_iter`` to the empirical test
    :param cores: number of cores passed to the empirical test
    :param n_bins: passed as ``degree_bins`` to ``st.StatisticalTest``
    :param output_lcc: if set, the largest connected component of every
        geneset that passes the raw size check is collected and written to
        this GMT file
    :param results_figure: if set, a results barplot is painted to this path
    :param diagnostic_null_folder: if set, one null-distribution plot per
        geneset is saved; the value is used as a plain string prefix, so it
        has to end with a path separator
    """
    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    setnames = list(geneset)
    output1 = out.Output(network_file, output_table, "topology_module",
                         geneset_file, setnames)
    logging.info("Results file = %s", output1.output_table_results)
    output1.create_st_table_empirical()

    st_test = st.StatisticalTest(st.geneset_module_statistic,
                                 network,
                                 degree_bins=n_bins)
    for name, genes in geneset.items():
        genes = set(genes)
        # Only genesets strictly smaller than size_cut are discarded, which
        # matches the mapped-size check applied after the test.
        if len(genes) < size_cut:
            continue

        if output_lcc:
            # The LCC entry is recorded before the test runs, so it is kept
            # even when the geneset is later dropped for its mapped size.
            module = nx.subgraph(network, genes)
            if len(module.nodes) > 0:
                # max() returns the first largest component, the same one a
                # stable descending sort would put first.
                lcc = max(nx.connected_components(module), key=len)
            else:
                lcc = []
            output1.add_GMT_entry(name, "topology_module", lcc)

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:%s", name)
        if n_mapped < size_cut:
            logging.info("%s remove from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          np.mean(null_d),
                                          np.var(null_d))
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name +
                '_module_null_distribution.pdf',
                setname=name)

    output1.close_temporary_table()
    if output_lcc:
        output1.create_GMT_output(output_lcc)

    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
Ejemplo n.º 5
0
def test_topology_centrality(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    distance_matrix_filename: "The matrix with the SP for each node",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: 'load hdf5 data onto memory' = False,
):
    """
    This function calculates the average closeness centrality of a geneset.
    For a single node, the closeness centrality is defined as the inverse
    of the shortest path distance of the node from all the other nodes.

    The network is restricted to its largest connected component. Genesets
    with fewer than ``size_cut`` genes, or fewer than ``size_cut`` mapped
    nodes, are left out of the results table.

    :param network_file: network file, read with ``rc.ReadTsv``
    :param geneset_file: GMT geneset file, read with ``rc.ReadGmt``
    :param distance_matrix_filename: shortest-path matrix file, read with
        ``cmd.read_distance_matrix``
    :param output_table: output results table
    :param setname: geneset to analyse, passed to ``get_geneset``
    :param size_cut: minimum geneset size (raw and mapped) to be reported
    :param number_of_permutations: passed as ``max_iter`` to the empirical test
    :param cores: number of cores passed to the empirical test
    :param in_memory: forwarded to ``cmd.read_distance_matrix``
    """

    # The start-up message used to name the total degree test by mistake.
    logging.info("Evaluating the test topology centrality, please wait")
    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)
    setnames = list(geneset)

    # Load the matrix file once and take both parts from the same result.
    loaded = cmd.read_distance_matrix(distance_matrix_filename,
                                      in_memory=in_memory)
    diz = {"nodes": loaded[0], "matrix": loaded[1]}
    # NOTE(review): adding the transpose presumably symmetrises a triangular
    # distance matrix -- confirm against the matrix builder.
    diz["matrix"] = diz["matrix"] + np.transpose(diz["matrix"])

    np.fill_diagonal(diz["matrix"], 0.0)

    # Column sums of the matrix, stored for the statistic under 'vector'.
    diz['vector'] = np.sum(diz["matrix"], axis=0)

    # Generate output
    output1 = out.Output(network_file, output_table, "topology_centrality",
                         geneset_file, setnames)
    logging.info("Results file = %s", output1.output_table_results)
    # Create table
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(average_closeness_centrality, network, diz)

    for name, genes in geneset.items():
        genes = set(genes)
        # Only genesets strictly smaller than size_cut are discarded, which
        # matches the mapped-size check applied after the test.
        if len(genes) < size_cut:
            continue

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:%s", name)
        if n_mapped < size_cut:
            logging.info("%s removed from results since nodes mapped are < %d",
                         name, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          np.mean(null_d),
                                          np.var(null_d))

    output1.close_temporary_table()

    logging.info("Test topology CENTRALITY completed")