# --- FRAGMENT (whitespace-mangled; statements collapsed onto one line) ---
# Cluster-node worker: reports its host, sleeps a random 0-89 s to stagger
# concurrent pollers, then pops the first table path off a shared on-disk
# queue file ("tables.txt") and rewrites the queue without it.
#
# NOTE(review): this line starts inside a try-block whose `try:` is outside
# the visible chunk — the bare `except: pass` silently swallows any error
# from the os.uname() report; presumably deliberate best-effort, but it
# would also hide real failures. Confirm against the full file.
# NOTE(review): the read-then-rewrite of the shared queue file
# (load_list → dump_list of correlation_tables[1:]) is not atomic; two
# nodes polling at the same moment could claim the same table. The random
# sleep reduces but does not eliminate that race — verify this is accepted.
# NOTE(review): the trailing `queue = list(` is truncated here — the else
# branch's argument continues outside this chunk; the `if` branch falls
# back to all column pairs when fewer than 2 feature groups (the part of a
# column name before "@") are present.
print("Running on the node {}".format(os.uname()[1])) except: pass sleep(np.random.randint(90)) print("Polling the queue") remote_queue = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data", "group_datasets", "tables.txt") correlation_tables = Utilities.remove_empty_values( Utilities.load_list(remote_queue)) if len(correlation_tables) == 0: print("Empty remote queue") sys.exit(0) Utilities.dump_list(correlation_tables[1:], remote_queue) correlation_table = correlation_tables[0] print("Now processing: '{}'".format(correlation_table)) group_name = os.path.splitext(os.path.basename(correlation_table))[0] out_dir = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data", "group_results", group_name) correlation_df = load_tsv(correlation_table).dropna(axis=0, how="any") feature_groups = sorted(set([i.split("@")[0] for i in correlation_df.columns])) if len(feature_groups) < 2: queue = list(combinations(correlation_df.columns, 2)) else: queue = list(
# --- FRAGMENT (whitespace-mangled; statements collapsed onto one line) ---
# Builds per-(feature-pair, age, diagnosis) correlation input tables: for
# every combination_with_replacement of two feature dataframes crossed with
# AGE_GROUPS x DIAGNOSIS_GROUPS, filters each dataframe by age/diagnosis,
# concatenates the selected data columns, writes one TSV per combination
# into .../correlation_data/group_datasets, and finally dumps the list of
# written paths to "tables.txt" plus a "tables.txt.bak" copy (the queue
# file the poller fragment above this file consumes).
#
# NOTE(review): the leading `])` and the `feature_dfs[feature_name] = ...`
# assignment continue an expression/loop whose opening is outside this
# chunk — feature_name, feature_df, sample_source, and raw_data_dfs are
# presumably bound by that enclosing loop; verify against the full file.
# NOTE(review): duplicate dump_list to tables.txt and tables.txt.bak looks
# like a deliberate backup of the work queue before workers start popping
# entries from tables.txt — confirm, since nothing here restores from it.
# NOTE(review): `sorted(set(feature_pair))` in the concat means a pair of
# identical features contributes its dataframe only once, and pair order is
# normalized alphabetically, even though the filename joins the original
# (unsorted, possibly duplicated) feature_pair with " vs ".
]) feature_dfs[feature_name] = feature_df.query( "sample_source == '{}'".format(sample_source)) feature_dfs.update(raw_data_dfs) correlation_dir = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data", "group_datasets") group_combinations = list( product(combinations_with_replacement(feature_dfs.keys(), 2), AGE_GROUPS, DIAGNOSIS_GROUPS)) correlation_tables = [] for feature_pair, age, diagnosis in group_combinations: query = "age == '{}' and diagnosis == '{}'".format(age, diagnosis) correlation_df = concat([ select_data_columns(feature_dfs[i].query(query).reset_index()) for i in sorted(set(feature_pair)) ]) correlation_table = os.path.join( correlation_dir, "{}_for_{}_{}.tsv".format(" vs ".join(feature_pair), age, diagnosis)) dump_tsv(correlation_df, correlation_table) correlation_tables.append(correlation_table) Utilities.dump_list(correlation_tables, os.path.join(correlation_dir, "tables.txt")) Utilities.dump_list(correlation_tables, os.path.join(correlation_dir, "tables.txt.bak"))