def test_top_bottom(self):
    """Check that vstack of the top and bottom GCTs matches the expected GCT.

    Parses three fixture files from FUNCTIONAL_TESTS_DIR, stacks the "top"
    and "bottom" files with cg.vstack, and compares the data, row-metadata
    and column-metadata frames of the result against the expected file.
    """
    top_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merge_top.gct")
    bottom_gct_path = os.path.join(
        FUNCTIONAL_TESTS_DIR, "test_merge_bottom.gct")
    expected_gct_path = os.path.join(
        FUNCTIONAL_TESTS_DIR, "test_merged_top_bottom.gct")

    top_gct = pg.parse(top_gct_path)
    bottom_gct = pg.parse(bottom_gct_path)
    expected_gct = pg.parse(expected_gct_path)

    # Merge top and bottom
    concated_gct = cg.vstack([top_gct, bottom_gct], False, None, [], False)

    # pd.testing is the public home of assert_frame_equal;
    # pd.util.testing is deprecated and absent from recent pandas releases.
    pd.testing.assert_frame_equal(
        expected_gct.data_df, concated_gct.data_df, check_names=False)
    pd.testing.assert_frame_equal(
        expected_gct.row_metadata_df, concated_gct.row_metadata_df,
        check_names=False)
    pd.testing.assert_frame_equal(
        expected_gct.col_metadata_df, concated_gct.col_metadata_df,
        check_names=False)
def main(args):
    """Query the external profiles against every cell line and write results.

    Steps, in order: read the config file, parse the external GCT once, run
    introspect on it and write that result, then for each cell line parse
    the internal and background GCTs, compute similarity and connectivity
    GCTs, and finally vstack all connectivity GCTs into one output file.
    On success a success.txt is written; on any exception inside the main
    body a failure.txt is written instead.

    Args:
        args: object exposing the attributes read below: out_dir,
            psp_on_clue_config_path, external_gct_path,
            fields_to_aggregate_for_external_profiles, assay and all.

    Returns:
        None

    Raises:
        AssertionError: if args.out_dir does not exist.
        Exception: generic exception pointing at failure.txt, raised after
            any error in the main body.
    """
    # Record start_time
    start_time = datetime.datetime.now()
    start_time_msg = "external_query_many.py started at {}".format(
        start_time.strftime('%Y-%m-%d %H:%M:%S'))

    # The output directory must already exist; it is not created here
    assert os.path.exists(args.out_dir), "args.out_dir: {}".format(
        args.out_dir)

    try:
        # Read and unpack config file
        (cells, internal_gct_dir, bg_gct_dir,
         fields_to_aggregate_for_internal_profiles,
         similarity_metric, connectivity_metric) = read_config_file(
            args.psp_on_clue_config_path)

        # Read in the external profiles only once
        external_gct = parse.parse(args.external_gct_path)

        # Introspect is always run on the external profiles
        (_, introspect_gct) = introspect.do_steep_and_sip(
            external_gct, similarity_metric, connectivity_metric,
            args.fields_to_aggregate_for_external_profiles)

        # Write introspect result
        actual_out_introspect_name = os.path.join(
            args.out_dir, OUT_INTROSPECT_NAME)
        wg.write(introspect_gct, actual_out_introspect_name,
                 data_null="NaN", metadata_null="NaN", filler_null="NaN")

        # Initialize list to store connectivity gcts
        list_of_conn_gcts = []

        # Loop over cell lines in corpus
        for cell in cells:

            # Import gct with the internal profiles for this cell line
            internal_gct_path = os.path.join(
                internal_gct_dir,
                INTERNAL_GCT_FORMAT.format(assay=args.assay, cell=cell))
            internal_gct = parse.parse(internal_gct_path)

            # Import gct with the similarity matrix for this cell line
            bg_gct_path = os.path.join(
                bg_gct_dir,
                BG_GCT_FORMAT.format(assay=args.assay, cell=cell))
            bg_gct = parse.parse(bg_gct_path)

            # NOTE(review): the metrics here are hard-coded while introspect
            # above uses similarity_metric / connectivity_metric from the
            # config file -- confirm whether this difference is intentional.
            (sim_gct, conn_gct) = eq.do_steep_and_sip(
                external_gct, internal_gct, bg_gct, "spearman", "ks_test",
                args.fields_to_aggregate_for_external_profiles,
                fields_to_aggregate_for_internal_profiles)

            # Append this connectivity gct
            list_of_conn_gcts.append(conn_gct)

            # Write all output gcts if requested
            if args.all:
                out_steep_name = os.path.join(
                    args.out_dir, OUT_STEEP_FORMAT.format(cell=cell))
                out_sip_name = os.path.join(
                    args.out_dir, OUT_SIP_FORMAT.format(cell=cell))

                wg.write(sim_gct, out_steep_name)
                wg.write(conn_gct, out_sip_name)

        # Concatenate connectivity GCTs
        concated = cg.vstack(list_of_conn_gcts)
        actual_out_concated_name = os.path.join(
            args.out_dir, OUT_CONCATED_NAME)

        # Write concatenated result
        wg.write(concated, actual_out_concated_name,
                 data_null="NaN", filler_null="NaN", metadata_null="NaN")

        # Write success.txt with timestamp
        success_path = os.path.join(args.out_dir, "success.txt")
        write_success(success_path, start_time_msg)

        # Return how much time it took. total_seconds() covers the whole
        # duration; the .seconds attribute drops whole days.
        end_time = datetime.datetime.now()
        seconds_elapsed = (end_time - start_time).total_seconds()
        logger.info("external_query_many.py completed in {:.0f} sec.".format(
            seconds_elapsed))

    except Exception:
        failure_path = os.path.join(args.out_dir, "failure.txt")
        msg = "external_query_many.py failed. See {} for stacktrace.".format(
            failure_path)

        # Write failure.txt (presumably including the stack trace, per the
        # message above -- see write_failure)
        write_failure(failure_path, start_time_msg)

        # Raise exception
        logger.error(msg)
        raise Exception(msg)

    return None