def test_main(self):
    """Run agfm.main end to end and compare its output with the expected GCT.

    The output file is written into the functional tests directory and is
    removed afterwards, even when main or one of the assertions fails.
    """
    gct_path = os.path.join(functional_tests_dir,
                            "test_annotate_gct_from_mapping_in.gct")
    mapping_path = os.path.join(functional_tests_dir,
                                "test_annotate_gct_from_mapping.tsv")
    expected_gct_path = os.path.join(
        functional_tests_dir, "test_annotate_gct_from_mapping_expected.gct")
    out_path = os.path.join(functional_tests_dir,
                            "test_annotate_gct_from_mapping_out.gct")

    # Pass argv as a list instead of splitting a formatted string, so that
    # paths containing whitespace are not broken into several arguments.
    argv = ["-i", gct_path, "-m", mapping_path, "-o", out_path,
            "-f", "pert_iname"]
    args = agfm.build_parser().parse_args(argv)

    try:
        agfm.main(args)

        # Read in expected and actual outputs
        e_gct = parse(expected_gct_path)
        out_gct = parse(out_path)

        pd.util.testing.assert_frame_equal(e_gct.data_df, out_gct.data_df)
        pd.util.testing.assert_frame_equal(e_gct.row_metadata_df,
                                           out_gct.row_metadata_df)
        pd.util.testing.assert_frame_equal(e_gct.col_metadata_df,
                                           out_gct.col_metadata_df)
    finally:
        # Clean up; the file may not exist if main failed before writing it
        if os.path.exists(out_path):
            os.remove(out_path)
def test_main(self):
    """Run sip.main end to end and compare the result with the expected GCT.

    Data values are compared with np.allclose; row and column metadata must
    be exactly equal. The output file is removed afterwards, even when main
    or one of the assertions fails.
    """
    test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_test.gct")
    bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
    out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")
    e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                              "test_sip_expected_conn.gct")

    # Pass argv as a list instead of splitting a formatted string, so that
    # paths containing whitespace are not broken into several arguments.
    argv = ["-t", test_gct_path, "-b", bg_gct_path, "-o", out_path,
            "-tfq", "pert_iname", "-tft", "pert_iname", "-bf", "pert_iname"]
    args = sip.build_parser().parse_args(argv)

    try:
        # Run main method
        sip.main(args)

        # Compare the output of main with the expected output
        e_out_gct = parse(e_out_path)
        out_gct = parse(out_path)

        self.assertTrue(
            np.allclose(e_out_gct.data_df.values, out_gct.data_df.values),
            ("\ne_out_gct.data_df:\n{}\nout_gct.data_df:\n{}".format(
                e_out_gct.data_df, out_gct.data_df)))
        self.assertTrue(
            e_out_gct.row_metadata_df.equals(out_gct.row_metadata_df),
            ("\ne_out_gct.row_metadata_df:\n{}\nout_gct.row_metadata_df:\n{}".
             format(e_out_gct.row_metadata_df, out_gct.row_metadata_df)))
        self.assertTrue(
            e_out_gct.col_metadata_df.equals(out_gct.col_metadata_df),
            ("\ne_out_gct.col_metadata_df:\n{}\nout_gct.col_metadata_df:\n{}".
             format(e_out_gct.col_metadata_df, out_gct.col_metadata_df)))
    finally:
        # Remove the created file; it may be absent if main failed early
        if os.path.exists(out_path):
            os.remove(out_path)
def main(args):
    """Parse the three input GCTs, run steep and sip, and write both results.

    Args:
        args (argparse.Namespace): must provide external_gct_path,
            internal_gct_path, bg_gct_path, similarity_metric,
            connectivity_metric, the two fields_to_aggregate_* options,
            out_steep_name and out_sip_name
    """
    # All inputs are parsed with the same options
    parse_options = {"convert_neg_666": False, "make_multiindex": True}
    external_gct = parse(args.external_gct_path, **parse_options)
    internal_gct = parse(args.internal_gct_path, **parse_options)
    bg_gct = parse(args.bg_gct_path, **parse_options)

    # Meat of the script
    sim_gct, conn_gct = do_steep_and_sip(
        external_gct, internal_gct, bg_gct,
        args.similarity_metric, args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Both outputs are written with "NaN" for every kind of null
    null_options = {"data_null": "NaN", "metadata_null": "NaN",
                    "filler_null": "NaN"}
    wg.write(sim_gct, args.out_steep_name, **null_options)
    wg.write(conn_gct, args.out_sip_name, **null_options)
def setUpClass(cls):
    """Parse the external, internal and background fixture GCTs once.

    Each file is parsed with convert_neg_666=False and make_multiindex=True
    and stored on the class as external_gct, internal_gct and bg_gct.
    """
    fixtures = (
        ("external_gct", "test_external_query_external.gct"),
        ("internal_gct", "test_external_query_internal.gct"),
        ("bg_gct", "test_external_query_bg.gct"),
    )
    for attr_name, file_name in fixtures:
        fixture_path = os.path.join(FUNCTIONAL_TESTS_DIR, file_name)
        setattr(cls, attr_name,
                parse(fixture_path, convert_neg_666=False,
                      make_multiindex=True))
def main(args):
    """Compute column similarities and write them to a .gct or .gctx file.

    If args.in_gct2_path is given, similarities are computed between the
    columns of the two inputs; otherwise within the first input. The column
    metadata of the inputs is reused as row/column metadata of the output
    and gets a SIMILARITY_METRIC_FIELD column added in place.

    Args:
        args (argparse.Namespace): must provide in_gct_path, in_gct2_path,
            similarity_metric and out_name

    Raises:
        Exception: if out_name does not end in .gct or .gctx
    """
    # Validate the output extension up front so that a bad out_name fails
    # before any parsing or similarity computation is done.
    out_ext = os.path.splitext(args.out_name)[1]
    if out_ext not in (".gct", ".gctx"):
        raise (Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name)))

    # Read in the first gct
    gct1 = parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which
        # similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric
        # was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if out_ext == ".gct":
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
    else:
        wgx.write(out_gct, args.out_name)
def setUpClass(cls):
    """Parse the external, internal and background fixture GCTs once.

    The parsed objects are stored on the class as external_gct,
    internal_gct and bg_gct.
    """
    fixtures = (
        ("external_gct", "test_external_query_external.gct"),
        ("internal_gct", "test_external_query_internal.gct"),
        ("bg_gct", "test_external_query_bg.gct"),
    )
    for attr_name, file_name in fixtures:
        fixture_path = os.path.join(FUNCTIONAL_TESTS_DIR, file_name)
        setattr(cls, attr_name, parse(fixture_path))
def main(args):
    """Compute column similarities and write them out as a GCT.

    With a second input (args.in_gct2_path), similarities are computed
    between the columns of both inputs; without it, within the first input.
    The inputs' column metadata is reused for the output and receives a
    SIMILARITY_METRIC_FIELD column in place.

    Args:
        args (argparse.Namespace): must provide in_gct_path, in_gct2_path,
            similarity_metric and out_name
    """
    metric = args.similarity_metric
    parse_options = {"convert_neg_666": False, "make_multiindex": True}

    # The first gct is always needed
    first_gct = parse(args.in_gct_path, **parse_options)

    if args.in_gct2_path is None:
        # Single input: similarities between the columns of the first gct
        similarity_df = compute_similarity_within_df(first_gct.data_df,
                                                     metric)

        # The same metadata frame describes both rows and columns
        shared_meta = first_gct.col_metadata_df
        shared_meta[SIMILARITY_METRIC_FIELD] = metric

        out_gct = GCToo.GCToo(similarity_df, shared_meta, shared_meta)
    else:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        second_gct = parse(args.in_gct2_path, **parse_options)

        similarity_df = compute_similarity_bw_two_dfs(
            first_gct.data_df, second_gct.data_df, metric)

        # Rows come from the first gct, columns from the second
        row_meta = first_gct.col_metadata_df
        col_meta = second_gct.col_metadata_df

        # Record which metric was used on both axes
        row_meta[SIMILARITY_METRIC_FIELD] = metric
        col_meta[SIMILARITY_METRIC_FIELD] = metric

        out_gct = GCToo.GCToo(similarity_df, row_meta, col_meta)

    # Write output gct
    wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA",
             filler_null="NA")
def main(args):
    """Separate a GCT by a metadata field and write one file per value.

    Each output is named out_name_prefix + <value> + out_name_suffix inside
    args.out_dir and is written as GCT or GCTX depending on its extension
    (compared case-insensitively).

    Args:
        args (argparse.Namespace): must provide in_gct_path, separate_field,
            row_or_col, out_dir, out_name_prefix and out_name_suffix

    Raises:
        Exception: if an output name ends in neither .gct nor .gctx
    """
    # Import gct
    in_gct = parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)

        # Use the .lower() method rather than str.lower(...): the unbound
        # form raises TypeError for non-str strings (e.g. Python 2 unicode).
        out_ext = os.path.splitext(full_out_name)[1].lower()

        # Write to GCT or GCTX depending on extension
        if out_ext == ".gct":
            wg.write(gct, full_out_name, data_null="NaN",
                     metadata_null="NA", filler_null="NA")
        elif out_ext == ".gctx":
            wgx.write(gct, full_out_name)
        else:
            raise (Exception(
                "out_name_suffix must end in either .gct or .gctx. out_name_suffix: {}"
                .format((args.out_name_suffix))))
def main(args):
    """Annotate a GCT's metadata from a tab-separated mapping file.

    Every column of the mapping file is applied, via annotate_meta_df, to
    the row metadata, the column metadata, or both, depending on
    args.row_and_or_col. The result is written to args.out_name.

    Args:
        args (argparse.Namespace): must provide path_to_gct,
            path_to_mapping_tsv, row_and_or_col, gct_from_field,
            missing_entry and out_name

    Raises:
        AssertionError: if the ids (first column) of the mapping file are
            not unique
    """
    gct = parse(args.path_to_gct)

    # First column of the mapping file holds the ids
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # The ids from the mapping file must be unique
    is_duplicate = mapping.index.duplicated()
    assert not is_duplicate.any(), (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".
        format(mapping.index[is_duplicate]))

    # Pick the metadata frames to annotate; unrecognized values select none
    targets_by_mode = {
        "both": [gct.row_metadata_df, gct.col_metadata_df],
        "row": [gct.row_metadata_df],
        "col": [gct.col_metadata_df],
    }
    targets = targets_by_mode.get(args.row_and_or_col, [])

    for col in mapping.columns:
        for meta_df in targets:
            annotate_meta_df(meta_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct, args.out_name, filler_null="NA", data_null="NaN",
             metadata_null="NA")
def main(args):
    """Replace NaNs in a GCT's data and write the result.

    Rows that are entirely NaN are removed first. Remaining NaNs are then
    replaced according to args.replace_with: "zero" fills with 0, "median"
    and "mean" fill each row with that row's own median or mean. Any other
    value leaves the data unchanged.

    Args:
        args (argparse.Namespace): must provide in_gct_path, replace_with
            and out_name

    Raises:
        AssertionError: if in_gct_path does not exist
    """
    # Import data
    assert os.path.exists(
        args.in_gct_path), ("in_gct_path could not be found: {}").format(
            args.in_gct_path)
    in_gct = parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.slice_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with in ("median", "mean"):
        if args.replace_with == "median":
            row_fill_values = in_gct.data_df.median(axis=1)
        else:
            row_fill_values = in_gct.data_df.mean(axis=1)

        for row_idx in range(in_gct.data_df.shape[0]):
            this_row = in_gct.data_df.iloc[row_idx, :]

            # Look the fill value up by position: the statistics Series is
            # indexed by row id, so plain [] with an integer would be
            # treated as a label when the row ids are integers.
            in_gct.data_df.iloc[row_idx, :] = this_row.fillna(
                row_fill_values.iloc[row_idx])

    wg.write(in_gct, args.out_name, filler_null="NA")
def read_gct_and_config_file(gct_path, config_path):
    """Load a gct file together with its configuration file.

    The configuration file must contain the sections io, metadata and
    parameters; the items of each section are returned as a dictionary.

    Args:
        gct_path (string): filepath to gct file
        config_path (string): filepath to config file ("~" is expanded)

    Returns:
        gct (GCToo object)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)

    """
    expanded_config_path = os.path.expanduser(config_path)
    assert os.path.exists(expanded_config_path)

    # Read config file
    config_parser = ConfigParser.RawConfigParser()
    config_parser.read(expanded_config_path)

    # One dictionary per section, in the order they are returned
    config_io, config_metadata, config_parameters = [
        dict(config_parser.items(section))
        for section in ("io", "metadata", "parameters")
    ]

    # Parse the gct file into a GCToo object
    gct = parse(gct_path)

    return gct, config_io, config_metadata, config_parameters
def save_drug_dataset(only_landmark_genes=False):
    """Assemble the drug dataset and pickle it to pickle_all_data_path.

    Reads the expression matrix (transposed after parsing), the labels, the
    signature info and the gene info from the module-level paths, optionally
    keeps only the genes whose pr_is_lm flag equals 1, and stores the
    results of get_drug_data / get_drug_labels in a pickled dictionary with
    the keys drug_expression, drug_perturbations and label_df.

    Args:
        only_landmark_genes (bool): if True, drop every expression column
            whose gene id is not flagged pr_is_lm == 1 in the gene info

    Returns:
        drug_perturbations, drug_expression, label_df
    """
    expression_df = parse(gctx_path).data_df.transpose()

    # pd.read_csv(index_col=0, parse_dates=True) is the documented
    # replacement for DataFrame.from_csv, which newer pandas no longer has.
    label_df = pd.read_csv(label_path, index_col=0, parse_dates=True)
    signatures_df = pd.read_csv(sig_path, sep='\t', index_col=0,
                                parse_dates=True)
    gene_info_df = pd.read_csv(gene_path, sep='\t', index_col=0,
                               parse_dates=True)

    # Single-argument print(...) gives the same output on Python 2 and 3
    print("Expression DataFrame Shape: {}".format(expression_df.shape))

    if only_landmark_genes:
        # A set makes each membership test O(1) instead of scanning a list
        landmark_genes = set(
            gene_info_df.loc[gene_info_df['pr_is_lm'] == 1]
            .index.values.tolist())
        del_gene_list = [
            gene_id for gene_id in expression_df.columns.values.tolist()
            if gene_id not in landmark_genes
        ]
        expression_df = expression_df.drop(del_gene_list, axis=1)
        print("Expression DataFrame Only Landmark Genes Shape: {}".format(
            expression_df.shape))

    drug_expression, drug_perturbations = get_drug_data(
        expression_df, signatures_df, label_df)
    label_df = get_drug_labels(drug_perturbations.keys(), label_df)

    dataset = {
        "drug_expression": drug_expression,
        "drug_perturbations": drug_perturbations,
        "label_df": label_df,
    }
    with open(pickle_all_data_path, 'wb') as handle:
        pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return drug_perturbations, drug_expression, label_df
def main():
    """Concatenate several gct(x) files into one and write the result.

    Input files are taken from args.input_filepaths when given, otherwise
    they are looked up with args.file_wildcard. When exactly one file is
    found, a warning is logged and nothing is written.

    Raises:
        Exception: if the wildcard lookup finds no files
    """
    # get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args: {}".format(args))

    if args.input_filepaths is None:
        # Find the files with the wildcard
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)
    else:
        # Files were given directly
        files = args.input_filepaths

    # Only 1 file found: nothing to concatenate
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # Parse every file
    gctoos = [parse(f) for f in files]

    # Create concatenated gctoo object
    stackers = {"horiz": hstack, "vert": vstack}
    if args.concat_direction in stackers:
        stack = stackers[args.concat_direction]
        out_gctoo = stack(gctoos, args.remove_all_metadata_fields,
                          args.error_report_output_file,
                          args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo, args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def main():
    """Count the CMap perturbagens that have DrugBank interaction data.

    Builds two dictionaries keyed by pert id (one with the DrugBank id, one
    with a matching drug-protein pair), prints the size of the second one
    and then exits via sys.exit(). Everything after that first sys.exit()
    is exploratory code that is never reached.
    """
    # read drug protein interaction data from DrugBank.
    protein_drug_list, drug_name_list = FeatureGeneration.read_DrugBank()
    pertID_drugbankID_dict = generate_broadpert_DrugBank_dict()

    # put all pertIDs with DrugBank drug-protein interaction data into a
    # dictionary...
    pertID_with_drugbank_interaction_dict = {}
    selected_drug_protein_list_with_CMap_data = {}
    for keys in pertID_drugbankID_dict:
        if pertID_drugbankID_dict[keys] in drug_name_list:
            pertID_with_drugbank_interaction_dict[keys] = pertID_drugbankID_dict[keys]

            # find the drug-protein pairs with drugs found in CMap...
            # NOTE(review): each match overwrites the previous one, so only
            # the last matching pair per pert id is kept - confirm intended.
            for drug_protein_pair in protein_drug_list:
                if pertID_drugbankID_dict[keys] == drug_protein_pair[1]:
                    selected_drug_protein_list_with_CMap_data[keys] = drug_protein_pair

    print "selected_drug_protein_list_with_CMap_data_overlap: " + str(len(selected_drug_protein_list_with_CMap_data))

    # The function stops here; the code below is unreachable.
    sys.exit()

    # a way to generate a mol object from a SMILES string directly...
    m2 = Chem.MolFromSmiles('C1CCC1')
    Mogen2_matrix = FeatureGeneration.generate_fingerprint("Morgan2", [m2])

    # play with GEO dataset..
    sig_info = pd.read_csv("GSE92742_Broad_LINCS_sig_info.txt", sep="\t")
    selected_sig_id_list = []
    test = []

    # get the signature IDs for those perturbation drugs in both
    # drug-target interaction pairs and CMap... ~ 2700
    for key in pertID_with_drugbank_interaction_dict:
        selected_sig_id_list.append(sig_info["sig_id"][sig_info["pert_id"] == key])

    # gene info is read with dtype=str, hence the comparison against "1"
    gene_info = pd.read_csv("GSE92742_Broad_LINCS_gene_info.txt", sep="\t", dtype=str)
    landmark_gene_row_ids = gene_info["pr_gene_id"][gene_info["pr_is_lm"] == "1"]

    my_col_metadata = parse("GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", col_meta_only=True)
    print my_col_metadata
    print type(my_col_metadata)
    print np.shape(my_col_metadata)

    #vorinostat_only_gctoo = parse("GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=vorinostat_ids)
    #test = vorinostat_only_gctoo.data_df.as_matrix()
    #print type(test)
    #print np.shape(test)
    #print test
    sys.exit()
def main(args):
    """Rank every square score GCT matched by a wildcard and save the ranks.

    For each input the diagonal of the data is set to NaN in place, the
    matrix is ranked in descending order (optionally as percentiles scaled
    to 0-100) and the result is written to
    out_dir + <file prefix> + output_suffix.

    Args:
        args (argparse.Namespace): must provide in_dir, file_wildcard,
            prefix_separator, do_percentile_rank, out_dir and output_suffix

    Raises:
        AssertionError: if fewer than two files match, or if an input
            matrix is not square
    """
    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)
    assert len(gct_paths) > 1, "full_path_wildcard: {}".format(
        full_path_wildcard)

    # The part of each file name before the separator is reused when saving
    prefixes = []
    for gct_path in gct_paths:
        file_name = os.path.basename(gct_path)
        prefixes.append(file_name.split(args.prefix_separator)[0])

    for gct_path, prefix in zip(gct_paths, prefixes):
        print("path: {}".format(gct_path))
        print("prefix: {}".format(prefix))

    # Import all gcts before ranking any of them
    gctoos = [parse(gct_path) for gct_path in gct_paths]
    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for gctoo, prefix in zip(gctoos, prefixes):
        score_df = gctoo.data_df

        # Must be square
        n_rows, n_cols = score_df.shape[0], score_df.shape[1]
        assert n_rows == n_cols, "Input dataframe must be square."

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        rank_df = (score_df.rank(ascending=False, pct=True) * 100
                   if args.do_percentile_rank
                   else score_df.rank(ascending=False))

        # Make a GCToo
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=gctoo.row_metadata_df,
                                 col_metadata_df=gctoo.col_metadata_df)

        # Save the ranks to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo, out_name, filler_null="NaN", data_null="NaN",
                 metadata_null="NaN")
def get_dataset():
    """Build shuffled train/test splits of expression data and labels.

    Signatures whose pert_id has no entry in the label table are dropped,
    the remaining signatures are shuffled, and each signature gets the
    label row of its pert_id. Label columns whose sum is below prune_count
    are removed. After splitting at train_size, label columns that have no
    positive entry in either the train or the test part are removed too.

    Returns:
        x_train, y_train, x_test, y_test
    """
    expression_df = parse(gctx_path).data_df.transpose()

    # pd.read_csv(index_col=0, parse_dates=True) is the documented
    # replacement for DataFrame.from_csv, which newer pandas no longer has.
    label_df = pd.read_csv(label_path, index_col=0, parse_dates=True)
    sig_info_df = pd.read_csv(sig_path, sep='\t', index_col=0,
                              parse_dates=True)

    # Single-argument print(...) gives the same output on Python 2 and 3
    print("Expression shape: {}".format(expression_df.shape))

    # A set makes each membership test O(1) instead of scanning an array
    label_pert_ids = set(label_df.index.values)

    # Signatures whose perturbagen has no labels are deleted
    del_sig_list = [
        sig_id for sig_id in expression_df.index.values
        if sig_info_df.loc[sig_id, "pert_id"] not in label_pert_ids
    ]
    expression_df = expression_df.drop(del_sig_list)

    # Shuffle the signatures
    expression_df = expression_df.sample(frac=1)

    # collect signature side effect labels
    sig_label_data = []
    for sig_id in expression_df.index.values:
        pert_id = sig_info_df.loc[sig_id, "pert_id"]
        sig_label_data.append(label_df.loc[pert_id].values)
    sig_label_df = pd.DataFrame(data=sig_label_data,
                                index=expression_df.index.values,
                                columns=label_df.columns.values)
    print("Before column pruning y shape: {}".format(sig_label_df.shape))

    # Drop label columns with fewer than prune_count positives overall
    del_adr_names = [
        adr_name for adr_name in list(sig_label_df)
        if np.sum(sig_label_df.loc[:, adr_name].values) < prune_count
    ]
    sig_label_df = sig_label_df.drop(del_adr_names, axis=1)
    print("After column pruning y shape: {}".format(sig_label_df.shape))

    train_cnt = int(floor(expression_df.shape[0] * train_size))
    x_train = expression_df.iloc[0:train_cnt]
    y_train = sig_label_df.iloc[0:train_cnt]
    x_test = expression_df.iloc[train_cnt:]
    y_test = sig_label_df.iloc[train_cnt:]
    print("Before train/test column pruning y shape: {} {}".format(
        y_train.shape, y_test.shape))

    # Drop label columns without a positive in either the train or test part
    del_adr_names = [
        adr_name for adr_name in list(sig_label_df)
        if np.sum(y_train.loc[:, adr_name].values) < 1
        or np.sum(y_test.loc[:, adr_name].values) < 1
    ]
    y_train = y_train.drop(del_adr_names, axis=1)
    y_test = y_test.drop(del_adr_names, axis=1)
    print("After train/test column pruning y shape: {} {}".format(
        y_train.shape, y_test.shape))

    return x_train, y_train, x_test, y_test
def main(args):
    """Compute connectivities from multi-index GCTs and write the signed result.

    Args:
        args (argparse.Namespace): must provide test_gct_path, bg_gct_path,
            the three fields_to_aggregate_* options, separator,
            connectivity_metric and out_name
    """
    # Both inputs are parsed with the same options
    parse_options = {"convert_neg_666": False, "make_multiindex": True}
    test_gct = parse(args.test_gct_path, **parse_options)
    bg_gct = parse(args.bg_gct_path, **parse_options)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    test_df, bg_df = prepare_multi_index_dfs(
        test_gct.multi_index_df, bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, args.separator)

    # Check symmetry; only the first returned value is needed
    is_test_df_sym, _unused = check_symmetry(test_gct.multi_index_df,
                                             bg_gct.multi_index_df)

    # Compute connectivity
    conn_mi_df, signed_conn_mi_df = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to write output gct
    component_dfs = GCToo.multi_index_df_to_component_dfs(
        signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME)
    signed_data_df, signed_row_metadata_df, signed_col_metadata_df = (
        component_dfs)

    # Record the connectivity metric on the columns first, then the rows
    for meta_df in (signed_col_metadata_df, signed_row_metadata_df):
        add_connectivity_metric_to_metadata(meta_df,
                                            args.connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    # Create gct and write it to file
    conn_gct = GCToo.GCToo(data_df=signed_data_df,
                           row_metadata_df=signed_row_metadata_df,
                           col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN",
             metadata_null="NaN")
def test_main(self):
    """Run sip.main end to end and compare the result with the expected GCT.

    Data and both metadata frames must match the expected file. The output
    file is removed afterwards, even when main or an assertion fails.
    """
    test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_test.gct")
    bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
    out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")
    e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                              "test_sip_expected_conn.gct")

    # Pass argv as a list instead of splitting a formatted string, so that
    # paths containing whitespace are not broken into several arguments.
    argv = ["-t", test_gct_path, "-b", bg_gct_path, "-o", out_path,
            "-tfq", "pert_iname", "-tft", "pert_iname", "-bf", "pert_iname",
            "-s", "|"]
    args = sip.build_parser().parse_args(argv)

    try:
        # Run main method
        sip.main(args)

        # Compare the output of main with the expected output
        e_out_gct = parse(e_out_path)
        out_gct = parse(out_path)

        logger.debug("e_out_gct.data_df:\n{}".format(e_out_gct.data_df))
        logger.debug("out_gct.data_df:\n{}".format(out_gct.data_df))
        pd.util.testing.assert_frame_equal(e_out_gct.data_df,
                                           out_gct.data_df)

        logger.debug("e_out_gct.row_metadata_df:\n{}".format(
            e_out_gct.row_metadata_df))
        logger.debug("out_gct.row_metadata_df:\n{}".format(
            out_gct.row_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.row_metadata_df,
                                           out_gct.row_metadata_df)

        logger.debug("e_out_gct.col_metadata_df:\n{}".format(
            e_out_gct.col_metadata_df))
        logger.debug("out_gct.col_metadata_df:\n{}".format(
            out_gct.col_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.col_metadata_df,
                                           out_gct.col_metadata_df)
    finally:
        # Remove the created file; it may be absent if main failed early
        if os.path.exists(out_path):
            os.remove(out_path)
def main(args):
    """Compute connectivities between a test and a background GCT.

    The signed connectivity result is annotated with the connectivity
    metric on both axes and written to args.out_name.

    Args:
        args (argparse.Namespace): must provide test_gct_path, bg_gct_path,
            the three fields_to_aggregate_* options, separator,
            connectivity_metric and out_name
    """
    # Read both inputs
    test_gct = parse(args.test_gct_path)
    bg_gct = parse(args.bg_gct_path)

    # Check symmetry; only the first returned value is needed
    is_test_df_sym, _unused = check_symmetry(test_gct.data_df,
                                             bg_gct.data_df)

    # Create an aggregated metadata field in test and background GCTs
    # that will be used to aggregate replicates
    test_gct, bg_gct = create_aggregated_fields_in_GCTs(
        test_gct, bg_gct,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, args.separator)

    # Compute connectivity
    conn_gct, signed_conn_gct = compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym,
        args.separator)

    # Record the connectivity metric on the columns first, then the rows
    for meta_df in (signed_conn_gct.col_metadata_df,
                    signed_conn_gct.row_metadata_df):
        add_connectivity_metric_to_metadata(meta_df,
                                            args.connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    # Write signed result to file
    wg.write(signed_conn_gct, args.out_name, data_null="NaN",
             filler_null="NaN", metadata_null="NaN")
def load(self):
    """
    Calls the cmapPy gctx parser and stores the data matrix in self.data.

    Row and column ids that arrive as the textual repr of a bytes object
    (b'...') are reduced to the bare id; actual bytes ids are decoded as
    UTF-8. Ids that are already clean are left untouched.

    returns: None
    """

    def _clean_label(label):
        # Real bytes object (on Python 3 bytes is not str): decode it
        if isinstance(label, bytes) and not isinstance(label, str):
            return label.decode("utf-8")
        # Textual repr of bytes: strip the leading b' and the trailing '.
        # Only strip when the wrapper is really there, so clean ids do not
        # lose their first two and last characters.
        if (isinstance(label, str) and len(label) >= 3
                and label.startswith("b'") and label.endswith("'")):
            return label[2:-1]
        return label

    self.data = GEX.parse(self.path).data_df

    ##Dealing with cmapPy data type instability:
    self.data.index = [_clean_label(x) for x in self.data.index]
    self.data.columns = [_clean_label(x) for x in self.data.columns]
def main(args):
    """Run steep and sip on one GCT and write only the connectivity result.

    Args:
        args (argparse.Namespace): must provide in_gct_path,
            similarity_metric, connectivity_metric, fields_to_aggregate
            and out_sip_name
    """
    in_gct = parse(args.in_gct_path)

    # The first returned GCT is not written by this script
    _unused_gct, conn_gct = do_steep_and_sip(
        in_gct,
        args.similarity_metric,
        args.connectivity_metric,
        args.fields_to_aggregate)

    # Write output gct
    null_options = {"data_null": "NaN", "filler_null": "NaN",
                    "metadata_null": "NaN"}
    wg.write(conn_gct, args.out_sip_name, **null_options)
def main(args):
    """Write a GCT whose data is 1 minus the data of the input GCT.

    Row and column metadata are carried over unchanged.

    Args:
        args (argparse.Namespace): must provide in_gct_path and out_name
    """
    source_gct = parse(args.in_gct_path)

    # Distance is computed element-wise as 1 - value
    distance_gct = GCToo.GCToo(1 - source_gct.data_df,
                               source_gct.row_metadata_df,
                               source_gct.col_metadata_df)

    wg.write(distance_gct, args.out_name, filler_null="NA")
def test_main1(self):
    """Run introspect.main end to end and compare with the expected GCT.

    The output file is removed afterwards, even when main or one of the
    assertions fails.
    """
    input_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                  "test_introspect_main.gct")
    output_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                   "test_introspect_main_out.gct")
    expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                     "test_introspect_main_expected.gct")

    # Pass argv as a list instead of splitting a formatted string, so that
    # paths containing whitespace are not broken into several arguments.
    argv = ["-i", input_gct_path, "-o", output_gct_path, "-fa", "chd1"]
    args = introspect.build_parser().parse_args(argv)

    try:
        introspect.main(args)

        # Read in output and expected gcts and confirm that they're equal
        output_gct = parse(output_gct_path)
        expected_gct = parse(expected_gct_path)

        pd.util.testing.assert_almost_equal(expected_gct.data_df,
                                            output_gct.data_df,
                                            check_less_precise=2)
        pd.testing.assert_frame_equal(expected_gct.row_metadata_df,
                                      output_gct.row_metadata_df)
        pd.testing.assert_frame_equal(expected_gct.col_metadata_df,
                                      output_gct.col_metadata_df)
    finally:
        # Clean up; the file may not exist if main failed before writing it
        if os.path.exists(output_gct_path):
            os.remove(output_gct_path)
def read_reduced():
    """
    Reads the reduced gctx file and relabels its columns by pert_id.

    The column ids of the reduced data are looked up as sig_ids in the
    signature info table and replaced by the corresponding pert_id.

    :return the reduced data as a pandas dataframe, transposed so that the
            pert_id-labelled signatures are the rows
    """
    ### read in the reduced data
    reduced_data = parse(join(FILE_PATH, "lm_sm_aggz.gctx"))

    ### read in the signature info, indexed by signature id for the lookup below
    sig_info_path = join(FILE_PATH, "GSE92742_Broad_LINCS_sig_info.txt")
    sig_info = pd.read_csv(sig_info_path, sep="\t")
    sig_info.index = sig_info['sig_id']

    ### look up the pert_id that generated each signature (column)
    signature_ids = pd.Index(reduced_data.data_df.columns)
    pert_ids = sig_info.loc[signature_ids]['pert_id']
    reduced_data.data_df.columns = pert_ids

    ### return the data frame with one row per signature
    return reduced_data.data_df.transpose()
def main(args):
    """Dispatch to main_sym or main_asym depending on the GCT's metadata.

    The GCT is treated as symmetric when its row metadata equals its column
    metadata.

    Args:
        args (argparse.Namespace): must provide input_gct_path and the
            figure, gml, annotation, query, threshold, percentile and
            vertex options passed on below

    Raises:
        AssertionError: if the annotation or query options do not fit the
            detected symmetry
    """
    # Parse gct
    gct = parse(args.input_gct_path)

    # TODO(LL): better integrate main_sym and main_asym

    is_symmetric = gct.row_metadata_df.equals(gct.col_metadata_df)

    if not is_symmetric:
        logger.info(("Row metadata does not equal column metadata. " +
                     "Assuming asymmetric GCT."))

        assert args.query_in_row_or_col != "both", (
            ("query_in_row_or_col must not be 'both' if the matrix is " +
             "asymmetric. args.query_in_row_or_col: {}").format(
                 args.query_in_row_or_col))

        # Main method for asymmetric gcts
        main_asym(gct, args.out_fig_name, args.out_gml_name,
                  args.row_annot_fields, args.col_annot_fields,
                  args.my_query, args.query_field,
                  args.query_in_row_or_col, args.threshold,
                  args.percentile, args.vertex_label_field,
                  args.vertex_color_field)
        return

    logger.info(("Row metadata equals column metadata. " +
                 "Assuming symmetric GCT."))

    assert args.row_annot_fields == args.col_annot_fields, (
        ("row_annot_fields should be the same as col_annot_fields if the " +
         "GCT is symmetric. args.row_annot_fields: {}, " +
         "args.col_annot_fields: {}").format(args.row_annot_fields,
                                             args.col_annot_fields))

    assert args.query_in_row_or_col is None, (
        ("query_in_row_or_col should be None for symmetric GCTs. " +
         "args.query_in_row_or_col: {}").format(args.query_in_row_or_col))

    # Main method for symmetric gcts
    main_sym(gct, args.out_fig_name, args.out_gml_name,
             args.row_annot_fields, args.my_query, args.query_field,
             args.threshold, args.percentile, args.vertex_label_field,
             args.vertex_color_field, layout=LAYOUT)
def main(args):
    """Separate a GCT by a metadata field and write one GCT per value.

    Each output is named out_name_prefix + <value> + out_name_suffix and is
    placed in args.out_dir.

    Args:
        args (argparse.Namespace): must provide in_gct_path, separate_field,
            row_or_col, out_dir, out_name_prefix and out_name_suffix
    """
    source_gct = parse(args.in_gct_path)

    # separate returns the gcts and the value each one belongs to
    separated_gcts, field_values = separate(source_gct, args.separate_field,
                                            args.row_or_col)

    for separated_gct, field_value in zip(separated_gcts, field_values):
        file_name = "".join(
            [args.out_name_prefix, str(field_value), args.out_name_suffix])
        out_path = os.path.join(args.out_dir, file_name)
        wg.write(separated_gct, out_path, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
def reduce_and_save():
    """
    Reads in the level 5 data restricted to landmark genes (rows) and small
    molecule signatures (columns) and writes that subset out as "lm_sm_aggz".
    """
    ### Signature information; its columns include sig_id, pert_id and pert_type
    sig_info_path = join(FILE_PATH, "GSE92742_Broad_LINCS_sig_info.txt")
    sig_info = pd.read_csv(sig_info_path, sep="\t")

    ### Signature ids of the small molecule perturbagens (pert_type "trt_cp")
    is_small_molecule = sig_info['pert_type'] == "trt_cp"
    small_mol_sig_ids = sig_info['sig_id'][is_small_molecule]

    ### Gene information; its columns include pr_gene_id and pr_is_lm
    gene_info_path = join(FILE_PATH, "GSE92742_Broad_LINCS_gene_info.txt")
    gene_info = pd.read_csv(gene_info_path, sep='\t')

    ### Gene ids flagged as landmark (pr_is_lm == 1)
    is_landmark = gene_info['pr_is_lm'] == 1
    landmark_gene_ids = gene_info['pr_gene_id'][is_landmark]

    ### Load only the selected columns and rows of the main file
    level5_path = join(
        FILE_PATH,
        "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx")
    subset_gctoo = parse(level5_path, cid=small_mol_sig_ids,
                         rid=landmark_gene_ids)

    ### Write the intermediate file
    write_gctx.write(subset_gctoo, join(FILE_PATH, "lm_sm_aggz"))
def test_gct_parsing(self):
    """Check gct parsing, -666 conversion, and rejection of unused arguments."""
    gct_path = "functional_tests/mini_gctoo_for_testing.gct"

    # parse in gct, no other arguments
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse(gct_path)

    for df_name in ("data_df", "row_metadata_df", "col_metadata_df"):
        pandas_testing.assert_frame_equal(getattr(mg1, df_name),
                                          getattr(mg2, df_name))

    # check convert_neg_666 worked correctly
    self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())

    # parse w/o convert_neg_666
    mg2_alt = parse(gct_path, convert_neg_666=False)
    self.assertFalse(
        mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all())

    # rid, cid, ridx and cidx are not used for gct files and must be rejected
    unused_arguments = (
        ("rid", ["a"]),
        ("cid", ["a"]),
        ("ridx", [0]),
        ("cidx", [0]),
    )
    for arg_name, arg_value in unused_arguments:
        with self.assertRaises(Exception) as context:
            parse(gct_path, **{arg_name: arg_value})
        self.assertTrue(
            "parse_gct does not use the argument" in str(context.exception))
def main(args):
    """Compute per-sample median and MAD metrics and write them to a pw file.

    The data is un-logged if needed (decided by
    undo_log_transform_if_needed from the provenance code), each row is
    divided by its maximum value, and the median and mad of every column
    are written to a tab-separated file together with the plate and well
    names.

    Args:
        args (argparse.Namespace): must provide gct_file_path, plate_field,
            well_field and out_pw_file_path
    """
    # Import gct
    gct = parse(args.gct_file_path)

    # Get plate and well names
    (plate_names, well_names) = extract_plate_and_well_names(
        gct.col_metadata_df, args.plate_field, args.well_field)

    # Extract provenance code
    prov_code = utils.extract_prov_code(gct.col_metadata_df,
                                        PROV_CODE_FIELD,
                                        PROV_CODE_DELIMITER)

    # If data has been log-transformed, undo it
    unlogged_df = undo_log_transform_if_needed(gct.data_df, prov_code)

    # Divide by the maximum value for the row
    max_row_values = unlogged_df.max(axis='columns')
    divided_df = unlogged_df.div(max_row_values, axis="rows")

    # Calculate metrics for each sample. Only the median and mad are
    # reported, so the previously unused mean and standard deviation are no
    # longer computed.
    medium_over_heavy_medians = divided_df.median(axis=0).values
    medium_over_heavy_mads = divided_df.mad(axis=0).values

    # Assemble plate_names, well_names, and metrics into a dataframe
    out_df = assemble_output_df(
        plate_names, well_names, {
            "medium_over_heavy_median": medium_over_heavy_medians,
            "medium_over_heavy_mad": medium_over_heavy_mads
        })

    # Write to pw file
    out_df.to_csv(args.out_pw_file_path, sep="\t", na_rep="NaN", index=False)
    logger.info("PW file written to {}".format(args.out_pw_file_path))
import cmapPy.pandasGEXpress.parse as parse
import broadinstitute_psp.utils.separate_gct as sg
import broadinstitute_psp.utils.setup_logger as setup_logger

logger = logging.getLogger(setup_logger.LOGGER_NAME)

# Input and expected-output fixtures for the separate_gct tests
functional_tests_dir = "utils/functional_tests/"
in_gct_path = functional_tests_dir + "test_separate_in.gct"
thing1_gct_path = functional_tests_dir + "test_separate_expected_thing1.gct"
thing2_gct_path = functional_tests_dir + "test_separate_expected_thing2.gct"
a375_gct_path = functional_tests_dir + "test_separate_expected_A375.gct"
ht29_gct_path = functional_tests_dir + "test_separate_expected_HT29.gct"
a549_gct_path = functional_tests_dir + "test_separate_expected_A549.gct"

# `parse` is bound to the module here (see the import above), so the parsing
# function has to be called as parse.parse; calling the module itself raises
# "TypeError: 'module' object is not callable" at import time.
in_gct = parse.parse(in_gct_path)
thing1_gct = parse.parse(thing1_gct_path)
thing2_gct = parse.parse(thing2_gct_path)
a375_gct = parse.parse(a375_gct_path)
ht29_gct = parse.parse(ht29_gct_path)
a549_gct = parse.parse(a549_gct_path)


class TestSeparateGct(unittest.TestCase):
    def test_separate_row(self):
        (thing_gcts, thing_fields) = sg.separate(in_gct, "thing", "row")
        self.assertListEqual(thing_fields, [1, 2])
        pd.util.testing.assert_frame_equal(thing_gcts[0].data_df,
                                           thing1_gct.data_df)