예제 #1
0
    def test_multi_index_df_to_component_dfs(self):
        """Check splitting a multi-index df into data, row and column dfs.

        Covers the regular case (several levels on both axes) and the edge
        case where the row index carries a single level only.
        """
        values = [[1, 3, 5], [7, 11, 13]]
        row_ids = ["D", "E"]
        col_ids = ["A", "B", "C"]

        # Input: ids plus two metadata levels on each axis
        full_row_index = pd.MultiIndex.from_arrays(
            [row_ids, [-666, -666], ["dd", "ee"]],
            names=["rid", "rhd1", "rhd2"])
        full_col_index = pd.MultiIndex.from_arrays(
            [col_ids, [1, 2, 3], ["Z", "Y", "X"]],
            names=["cid", "chd1", "chd2"])
        full_mi_df = pd.DataFrame(
            values, index=full_row_index, columns=full_col_index)

        # Expected components
        expected_row_meta = pd.DataFrame(
            [[-666, "dd"], [-666, "ee"]],
            index=pd.Index(row_ids, name="rid"),
            columns=pd.Index(["rhd1", "rhd2"], name="rhd"))
        expected_col_meta = pd.DataFrame(
            [[1, "Z"], [2, "Y"], [3, "X"]],
            index=pd.Index(col_ids, name="cid"),
            columns=pd.Index(["chd1", "chd2"], name="chd"))
        expected_data = pd.DataFrame(
            values,
            index=pd.Index(row_ids, name="rid"),
            columns=pd.Index(col_ids, name="cid"))

        data_out, row_out, col_out = GCToo.multi_index_df_to_component_dfs(
            full_mi_df)

        for actual, expected in ((col_out, expected_col_meta),
                                 (row_out, expected_row_meta),
                                 (data_out, expected_data)):
            self.assertTrue(actual.equals(expected))

        # Edge case: a multi-index with only one level on the rows (or
        # columns) becomes a regular index
        single_level_row_index = pd.MultiIndex.from_arrays(
            [row_ids], names=["rid"])
        single_level_mi_df = pd.DataFrame(
            values, index=single_level_row_index, columns=full_col_index)

        # With no row metadata levels, the row df should be empty
        expected_empty_row_meta = pd.DataFrame(index=["D", "E"])

        data_out2, row_out2, col_out2 = (
            GCToo.multi_index_df_to_component_dfs(single_level_mi_df))

        for actual, expected in ((row_out2, expected_empty_row_meta),
                                 (col_out2, expected_col_meta),
                                 (data_out2, expected_data)):
            self.assertTrue(actual.equals(expected))
예제 #2
0
def main(args):
    """Compute connectivities of a test gct against a background gct.

    Parses both gcts, prepares their multi-index dfs, computes the signed
    connectivities and writes them out as a gct.

    Args:
        args: argument object; the attributes read here are test_gct_path,
            bg_gct_path, fields_to_aggregate_in_test_gct_queries,
            fields_to_aggregate_in_test_gct_targets,
            fields_to_aggregate_in_bg_gct, separator, connectivity_metric
            and out_name
    """
    # Both inputs are parsed with identical options
    parse_options = dict(convert_neg_666=False, make_multiindex=True)
    test_gct = parse(args.test_gct_path, **parse_options)
    bg_gct = parse(args.bg_gct_path, **parse_options)

    # Build the aggregated metadata field on the index and columns of both
    # gcts, sorted by that field
    prepared_test_df, prepared_bg_df = prepare_multi_index_dfs(
        test_gct.multi_index_df,
        bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME,
        TARGET_FIELD_NAME,
        args.separator)

    # Symmetry is checked on the original (unprepared) multi-index dfs; only
    # the result for the test df is needed
    test_is_symmetric, _ = check_symmetry(
        test_gct.multi_index_df, bg_gct.multi_index_df)

    # Only the signed connectivities are used for the output
    _unsigned_mi_df, signed_mi_df = compute_connectivities(
        prepared_test_df, prepared_bg_df,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        args.connectivity_metric, test_is_symmetric)

    # Split the multi-index df into the three dfs a gct is made of
    out_data_df, out_row_meta_df, out_col_meta_df = (
        GCToo.multi_index_df_to_component_dfs(
            signed_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Record the connectivity metric in the column (query) metadata first,
    # then in the row (target) metadata
    for metadata_df in (out_col_meta_df, out_row_meta_df):
        add_connectivity_metric_to_metadata(
            metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)

    # Assemble the output gct and write it to file
    conn_gct = GCToo.GCToo(
        data_df=out_data_df,
        row_metadata_df=out_row_meta_df,
        col_metadata_df=out_col_meta_df)
    wg.write(conn_gct, args.out_name,
             data_null="NaN", filler_null="NaN", metadata_null="NaN")
예제 #3
0
def do_steep_and_sip(external_gct, internal_gct, bg_gct, similarity_metric,
                     connectivity_metric,
                     fields_to_aggregate_for_external_profiles,
                     fields_to_aggregate_for_internal_profiles):
    """Perform steep and sip between external and internal profiles.

    The column metadata of the input gcts is copied before the similarity
    metric column is appended, so external_gct and internal_gct are not
    modified by this function.

    Args:
        external_gct: gct object; its data_df and col_metadata_df are read
        internal_gct: gct object; its data_df and col_metadata_df are read
        bg_gct: gct object; its multi_index_df is used as the background
        similarity_metric: passed to steep.compute_similarity_bw_two_dfs and
            stored in the SIMILARITY_METRIC_FIELD metadata column
        connectivity_metric: passed to sip.compute_connectivities and to
            sip.add_connectivity_metric_to_metadata
        fields_to_aggregate_for_external_profiles: passed to
            sip.prepare_multi_index_dfs as the query fields of the test df
        fields_to_aggregate_for_internal_profiles: passed to
            sip.prepare_multi_index_dfs as the target fields of the test df
            and as the fields of the background df

    Returns:
        sim_gct: similarity gct (built with make_multiindex=True)
        conn_gct: connectivity gct built from the signed connectivities
    """

    #----------STEEP----------#

    # Compute similarity between external and internal profiles
    sim_df = steep.compute_similarity_bw_two_dfs(internal_gct.data_df,
                                                 external_gct.data_df,
                                                 similarity_metric)

    # Row metadata comes from the internal gct, column metadata from the
    # external gct. Work on copies: assigning a new column to the original
    # frames would silently modify the caller's gct objects.
    row_metadata_for_sim_df = internal_gct.col_metadata_df.copy()
    col_metadata_for_sim_df = external_gct.col_metadata_df.copy()

    # Append column to both metadata_dfs indicating which similarity_metric
    # was used
    row_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric
    col_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df,
                          row_metadata_for_sim_df,
                          col_metadata_for_sim_df,
                          make_multiindex=True)

    #----------SIP----------#

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = sip.prepare_multi_index_dfs(
        sim_gct.multi_index_df, bg_gct.multi_index_df,
        fields_to_aggregate_for_external_profiles,
        fields_to_aggregate_for_internal_profiles,
        fields_to_aggregate_for_internal_profiles, QUERY_FIELD_NAME,
        TARGET_FIELD_NAME, SEPARATOR)

    # Check symmetry; only the result for the test df is used below
    (is_test_df_sym, _) = sip.check_symmetry(sim_gct.multi_index_df,
                                             bg_gct.multi_index_df)

    # Compute connectivity; only the signed connectivities are used
    (_, signed_conn_mi_df) = sip.compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to assemble output gct
    (signed_data_df, signed_row_metadata_df,
     signed_col_metadata_df) = GCToo.multi_index_df_to_component_dfs(
         signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME)

    # Append to queries and targets a new column saying what connectivity
    # metric was used (return values are ignored, so the helper presumably
    # modifies the metadata dfs in place)
    sip.add_connectivity_metric_to_metadata(signed_col_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(signed_row_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    # Assemble connectivity gct
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)

    return sim_gct, conn_gct
예제 #4
0
def do_steep_and_sip(gct, similarity_metric, connectivity_metric,
                     fields_to_aggregate):
    """Perform steep and sip on the same GCT. AKA introspect.

    The column metadata of gct is copied before the similarity metric column
    is appended, so the input gct is not modified by this function.

    Args:
        gct: gct object; its data_df and col_metadata_df are read
        similarity_metric: passed to steep.compute_similarity_within_df and
            stored in the SIMILARITY_METRIC_FIELD metadata column
        connectivity_metric: passed to sip.compute_connectivities and to
            sip.add_connectivity_metric_to_metadata
        fields_to_aggregate: passed to sip.prepare_multi_index_dfs as the
            query, target and background fields

    Returns:
        sim_gct: similarity gct (built with make_multiindex=True)
        conn_gct: connectivity gct built from the signed connectivities

    """

    #----------STEEP--------#

    sim_df = steep.compute_similarity_within_df(gct.data_df, similarity_metric)

    # Row and column metadata are both from gct. Work on a copy: assigning a
    # new column to the original frame would silently modify the caller's gct.
    metadata_df = gct.col_metadata_df.copy()

    # Append column to metadata_df indicating which similarity_metric was used
    metadata_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df,
                          metadata_df,
                          metadata_df,
                          make_multiindex=True)

    #----------SIP----------#

    # Create an aggregated metadata field for index and columns of sim_gct
    # and sort by that field; sim_gct serves as both test and background
    (test_df, bg_df) = sip.prepare_multi_index_dfs(
        sim_gct.multi_index_df, sim_gct.multi_index_df, fields_to_aggregate,
        fields_to_aggregate, fields_to_aggregate, QUERY_FIELD_NAME,
        TARGET_FIELD_NAME, SEPARATOR)

    # Check symmetry; only the result for the test df is used below
    (is_test_df_sym, _) = sip.check_symmetry(sim_gct.multi_index_df,
                                             sim_gct.multi_index_df)

    # Compute connectivity; only the signed connectivities are used
    (_, signed_conn_mi_df) = sip.compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to assemble output gct
    (signed_data_df, signed_row_metadata_df,
     signed_col_metadata_df) = (GCToo.multi_index_df_to_component_dfs(
         signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries and targets a new column saying what connectivity
    # metric was used (return values are ignored, so the helper presumably
    # modifies the metadata dfs in place)
    sip.add_connectivity_metric_to_metadata(signed_col_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(signed_row_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    # Assemble connectivity gct
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)

    return sim_gct, conn_gct