def test_multi_index_df_to_component_dfs(self):
    """Check the data / row-metadata / col-metadata dfs produced by
    GCToo.multi_index_df_to_component_dfs for a multi-index df."""
    values = [[1, 3, 5], [7, 11, 13]]
    rids = ["D", "E"]
    cids = ["A", "B", "C"]

    # Input: three levels on the rows and three levels on the columns
    row_mi = pd.MultiIndex.from_arrays(
        [rids, [-666, -666], ["dd", "ee"]],
        names=["rid", "rhd1", "rhd2"])
    col_mi = pd.MultiIndex.from_arrays(
        [cids, [1, 2, 3], ["Z", "Y", "X"]],
        names=["cid", "chd1", "chd2"])
    mi_df = pd.DataFrame(values, index=row_mi, columns=col_mi)

    # Expected component dfs
    rid_index = pd.Index(rids, name="rid")
    cid_index = pd.Index(cids, name="cid")
    e_row_metadata_df = pd.DataFrame(
        [[-666, "dd"], [-666, "ee"]],
        index=rid_index,
        columns=pd.Index(["rhd1", "rhd2"], name="rhd"))
    e_col_metadata_df = pd.DataFrame(
        [[1, "Z"], [2, "Y"], [3, "X"]],
        index=cid_index,
        columns=pd.Index(["chd1", "chd2"], name="chd"))
    e_data_df = pd.DataFrame(values, index=rid_index, columns=cid_index)

    data_df, row_df, col_df = GCToo.multi_index_df_to_component_dfs(mi_df)
    first_case = (
        (col_df, e_col_metadata_df),
        (row_df, e_row_metadata_df),
        (data_df, e_data_df),
    )
    for actual, expected in first_case:
        self.assertTrue(actual.equals(expected))

    # Edge case: a multi-index with only a single level (here, on the
    # rows) becomes a regular index
    single_level_row_mi = pd.MultiIndex.from_arrays([rids], names=["rid"])
    mi_df2 = pd.DataFrame(values, index=single_level_row_mi, columns=col_mi)

    # With no row metadata levels, the expected row df has no columns
    e_row_df2 = pd.DataFrame(index=rids)

    data_df2, row_df2, col_df2 = GCToo.multi_index_df_to_component_dfs(mi_df2)
    second_case = (
        (row_df2, e_row_df2),
        (col_df2, e_col_metadata_df),
        (data_df2, e_data_df),
    )
    for actual, expected in second_case:
        self.assertTrue(actual.equals(expected))
def main(args):
    """Read the test and background gcts, compute connectivities between
    them, and write the resulting connectivity gct to args.out_name.

    Args:
        args: namespace providing test_gct_path, bg_gct_path,
            fields_to_aggregate_in_test_gct_queries,
            fields_to_aggregate_in_test_gct_targets,
            fields_to_aggregate_in_bg_gct, separator,
            connectivity_metric and out_name
    """
    # Parse the two inputs (test first, then background)
    query_gct = parse(args.test_gct_path, convert_neg_666=False,
                      make_multiindex=True)
    background_gct = parse(args.bg_gct_path, convert_neg_666=False,
                           make_multiindex=True)

    # Build the aggregated query/target fields on both multi-index dfs
    # and sort by them
    test_df, bg_df = prepare_multi_index_dfs(
        query_gct.multi_index_df,
        background_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME,
        TARGET_FIELD_NAME,
        args.separator)

    # Only the symmetry flag of the test df is needed downstream
    is_test_df_sym, _ = check_symmetry(query_gct.multi_index_df,
                                       background_gct.multi_index_df)

    # Only the second result of compute_connectivities is written out
    metric = args.connectivity_metric
    _, signed_conn_mi_df = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, metric, is_test_df_sym)

    # Break the multi-index df apart so an output gct can be assembled
    components = GCToo.multi_index_df_to_component_dfs(
        signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME)
    signed_data_df, signed_row_metadata_df, signed_col_metadata_df = components

    # Record which connectivity metric was used: columns first, then rows
    for metadata_df in (signed_col_metadata_df, signed_row_metadata_df):
        add_connectivity_metric_to_metadata(
            metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)

    # Assemble the output gct and write it to disk
    conn_gct = GCToo.GCToo(
        data_df=signed_data_df,
        row_metadata_df=signed_row_metadata_df,
        col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN",
             metadata_null="NaN")
def do_steep_and_sip(external_gct, internal_gct, bg_gct, similarity_metric,
                     connectivity_metric,
                     fields_to_aggregate_for_external_profiles,
                     fields_to_aggregate_for_internal_profiles):
    """Perform steep (similarity) and sip (connectivity) between external
    and internal profiles.

    Args:
        external_gct: gct object; its data_df and col_metadata_df are used
        internal_gct: gct object; its data_df and col_metadata_df are used
        bg_gct: background gct object; its multi_index_df is used
        similarity_metric: passed to steep.compute_similarity_bw_two_dfs and
            recorded in the similarity gct's metadata
        connectivity_metric: passed to sip.compute_connectivities and
            recorded in the connectivity gct's metadata
        fields_to_aggregate_for_external_profiles: fields aggregated for
            the queries of the similarity df
        fields_to_aggregate_for_internal_profiles: fields aggregated for
            the targets of the similarity df and for the background df

    Returns:
        sim_gct: similarity gct
        conn_gct: connectivity gct
    """
    #----------STEEP----------#

    # Compute similarity between external and internal profiles
    sim_df = steep.compute_similarity_bw_two_dfs(
        internal_gct.data_df, external_gct.data_df, similarity_metric)

    # Row metadata is from internal_gct, column metadata is from external_gct.
    # Work on copies so that adding the similarity-metric column below does
    # not modify the metadata of the gcts the caller passed in.
    row_metadata_for_sim_df = internal_gct.col_metadata_df.copy()
    col_metadata_for_sim_df = external_gct.col_metadata_df.copy()

    # Append column to both metadata_dfs indicating which similarity_metric
    # was used
    row_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric
    col_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df, row_metadata_for_sim_df,
                          col_metadata_for_sim_df, make_multiindex=True)

    #----------SIP----------#

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = sip.prepare_multi_index_dfs(
        sim_gct.multi_index_df, bg_gct.multi_index_df,
        fields_to_aggregate_for_external_profiles,
        fields_to_aggregate_for_internal_profiles,
        fields_to_aggregate_for_internal_profiles,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, SEPARATOR)

    # Check symmetry (only the flag for the test df is used)
    (is_test_df_sym, _) = sip.check_symmetry(sim_gct.multi_index_df,
                                             bg_gct.multi_index_df)

    # Compute connectivity (only the second result is used)
    (_, signed_conn_mi_df) = sip.compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to assemble output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries and targets a new column saying what connectivity
    # metric was used
    sip.add_connectivity_metric_to_metadata(
        signed_col_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(
        signed_row_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)

    # Assemble connectivity gct
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)

    return sim_gct, conn_gct
def do_steep_and_sip(gct, similarity_metric, connectivity_metric,
                     fields_to_aggregate):
    """ Perform steep and sip on the same GCT. AKA introspect.

    Args:
        gct: gct object; its data_df and col_metadata_df are used
        similarity_metric: passed to steep.compute_similarity_within_df and
            recorded in the similarity gct's metadata
        connectivity_metric: passed to sip.compute_connectivities and
            recorded in the connectivity gct's metadata
        fields_to_aggregate: fields aggregated for the queries, the targets
            and the background (the similarity gct serves as all three)

    Returns:
        sim_gct: similarity gct
        conn_gct: connectivity gct

    """
    #----------STEEP--------#

    sim_df = steep.compute_similarity_within_df(gct.data_df,
                                                similarity_metric)

    # Row and column metadata are both from gct. Work on a copy so that
    # adding the similarity-metric column below does not modify the
    # metadata of the gct the caller passed in.
    metadata_df = gct.col_metadata_df.copy()

    # Append column to metadata_df indicating which similarity_metric was
    # used
    metadata_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df, metadata_df, metadata_df,
                          make_multiindex=True)

    #----------SIP----------#

    # Create an aggregated metadata field for index and columns of sim_gct
    # and sort by that field
    (test_df, bg_df) = sip.prepare_multi_index_dfs(
        sim_gct.multi_index_df, sim_gct.multi_index_df,
        fields_to_aggregate, fields_to_aggregate, fields_to_aggregate,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, SEPARATOR)

    # Check symmetry
    (is_test_df_sym, _) = sip.check_symmetry(sim_gct.multi_index_df,
                                             sim_gct.multi_index_df)

    # Compute connectivity (only the second result is used)
    (_, signed_conn_mi_df) = sip.compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to assemble output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries and targets a new column saying what connectivity
    # metric was used
    sip.add_connectivity_metric_to_metadata(
        signed_col_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(
        signed_row_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)

    # Assemble connectivity gct
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)

    return sim_gct, conn_gct