Example #1
    def test_main(self):
        gct_path = os.path.join(functional_tests_dir,
                                "test_annotate_gct_from_mapping_in.gct")
        mapping_path = os.path.join(functional_tests_dir,
                                    "test_annotate_gct_from_mapping.tsv")
        expected_gct_path = os.path.join(
            functional_tests_dir,
            "test_annotate_gct_from_mapping_expected.gct")
        out_path = os.path.join(functional_tests_dir,
                                "test_annotate_gct_from_mapping_out.gct")

        args_string = "-i {} -m {} -o {} -f {}".format(gct_path, mapping_path,
                                                       out_path, "pert_iname")
        args = agfm.build_parser().parse_args(args_string.split())

        agfm.main(args)

        # Read in expected and actual outputs
        e_gct = parse(expected_gct_path)
        out_gct = parse(out_path)

        pd.util.testing.assert_frame_equal(e_gct.data_df, out_gct.data_df)
        pd.util.testing.assert_frame_equal(e_gct.row_metadata_df,
                                           out_gct.row_metadata_df)
        pd.util.testing.assert_frame_equal(e_gct.col_metadata_df,
                                           out_gct.col_metadata_df)

        # Clean up
        os.remove(out_path)
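
The tests above drive each script through its argparse-built parser rather than calling helpers directly. A minimal sketch of the parser shape Example #1 implies; the short flags come from args_string, while the long option names and defaults are assumptions for illustration:

import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--in_gct_path", required=True)       # input GCT
    parser.add_argument("-m", "--mapping_tsv_path", required=True)  # mapping TSV
    parser.add_argument("-o", "--out_name", required=True)          # output GCT
    parser.add_argument("-f", "--gct_from_field", required=True)    # field to map from
    return parser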
Example #2
    def test_main(self):
        test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                     "test_sip_in_test.gct")
        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
        out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")

        args_string = "-t {} -b {} -o {} -tfq {} -tft {} -bf {}".format(
            test_gct_path, bg_gct_path, out_path, "pert_iname", "pert_iname",
            "pert_iname")
        args = sip.build_parser().parse_args(args_string.split())

        # Run main method
        sip.main(args)

        # Compare the output of main with the expected output
        e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                  "test_sip_expected_conn.gct")
        e_out_gct = parse(e_out_path)
        out_gct = parse(out_path)

        self.assertTrue(
            np.allclose(e_out_gct.data_df.values, out_gct.data_df.values),
            ("\ne_out_gct.data_df:\n{}\nout_gct.data_df:\n{}".format(
                e_out_gct.data_df, out_gct.data_df)))
        self.assertTrue(
            e_out_gct.row_metadata_df.equals(out_gct.row_metadata_df),
            ("\ne_out_gct.row_metadata_df:\n{}\nout_gct.row_metadata_df:\n{}".
             format(e_out_gct.row_metadata_df, out_gct.row_metadata_df)))
        self.assertTrue(
            e_out_gct.col_metadata_df.equals(out_gct.col_metadata_df),
            ("\ne_out_gct.col_metadata_df:\n{}\nout_gct.col_metadata_df:\n{}".
             format(e_out_gct.col_metadata_df, out_gct.col_metadata_df)))

        # Remove the created file
        os.remove(out_path)
Example #3
def main(args):

    # Parse input gcts
    external_gct = parse(args.external_gct_path,
                         convert_neg_666=False,
                         make_multiindex=True)
    internal_gct = parse(args.internal_gct_path,
                         convert_neg_666=False,
                         make_multiindex=True)
    bg_gct = parse(args.bg_gct_path,
                   convert_neg_666=False,
                   make_multiindex=True)

    # Meat of the script
    (sim_gct, conn_gct) = do_steep_and_sip(
        external_gct, internal_gct, bg_gct, args.similarity_metric,
        args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Write output gcts
    wg.write(sim_gct,
             args.out_steep_name,
             data_null="NaN",
             metadata_null="NaN",
             filler_null="NaN")
    wg.write(conn_gct,
             args.out_sip_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #4
    def setUpClass(cls):
        external_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_external.gct")
        cls.external_gct = parse(external_gct_path, convert_neg_666=False, make_multiindex=True)

        internal_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_internal.gct")
        cls.internal_gct = parse(internal_gct_path, convert_neg_666=False, make_multiindex=True)

        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_external_query_bg.gct")
        cls.bg_gct = parse(bg_gct_path, convert_neg_666=False, make_multiindex=True)
Example #5
File: steep.py Project: yuanjun/psp
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name))
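
compute_similarity_within_df and compute_similarity_bw_two_dfs are not shown in this snippet. A minimal stand-in, assuming the similarity metric maps directly onto pandas' DataFrame.corr methods ("pearson" or "spearman"):

import pandas as pd

def compute_similarity_within_df(data_df, similarity_metric):
    # Pairwise similarities between the columns of a single DataFrame
    return data_df.corr(method=similarity_metric)

def compute_similarity_bw_two_dfs(df1, df2, similarity_metric):
    # Concatenate on the shared row index, correlate, then keep only the
    # cross-block: rows are df1's columns, columns are df2's columns
    combined = pd.concat([df1, df2], axis=1, keys=["df1", "df2"])
    return combined.corr(method=similarity_metric).loc["df1", "df2"]

This matches the metadata convention above: row metadata comes from gct1, column metadata from gct2.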
Example #6
    def setUpClass(cls):
        external_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                         "test_external_query_external.gct")
        cls.external_gct = parse(external_gct_path)

        internal_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                         "test_external_query_internal.gct")
        cls.internal_gct = parse(internal_gct_path)

        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                   "test_external_query_bg.gct")
        cls.bg_gct = parse(bg_gct_path)
Example #7
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path, convert_neg_666=False, make_multiindex=True)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path,
                     convert_neg_666=False,
                     make_multiindex=True)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    wg.write(out_gct,
             args.out_name,
             data_null="NaN",
             metadata_null="NA",
             filler_null="NA")
Example #8
def main(args):
    """ The main method. """

    # Import gct
    in_gct = parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)

        # Write to GCT or GCTX depending on extension
        if str.lower(os.path.splitext(full_out_name)[1]) == ".gct":
            wg.write(gct,
                     full_out_name,
                     data_null="NaN",
                     metadata_null="NA",
                     filler_null="NA")
        elif str.lower(os.path.splitext(full_out_name)[1]) == ".gctx":
            wgx.write(gct, full_out_name)
        else:
            raise Exception(
                "out_name_suffix must end in either .gct or .gctx. "
                "out_name_suffix: {}".format(args.out_name_suffix))
Example #9
def main(args):

    # Parse gct file
    gct = parse(args.path_to_gct)

    # Parse mapping tsv file
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # Make sure the ids from the mapping file are unique
    duplicated_bool_array = mapping.index.duplicated()
    assert sum(duplicated_bool_array) == 0, (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".
        format(mapping.index[duplicated_bool_array]))

    for col in mapping.columns:

        if args.row_and_or_col == "both":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "row":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "col":
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct,
             args.out_name,
             filler_null="NA",
             data_null="NaN",
             metadata_null="NA")
Example #10
def main(args):

    # Import data
    assert os.path.exists(args.in_gct_path), (
        "in_gct_path could not be found: {}".format(args.in_gct_path))
    in_gct = parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.slice_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with == "median":
        probe_medians = in_gct.data_df.median(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_medians[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    elif args.replace_with == "mean":
        probe_means = in_gct.data_df.mean(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_means[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    wg.write(in_gct, args.out_name, filler_null="NA")
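
The per-row fill loops above can be vectorized. A minimal sketch, assuming data_df is a numeric DataFrame with probes as rows; transposing lets fillna align the per-probe Series against column labels:

def fill_row_nans(data_df, stat="median"):
    # Per-probe statistic computed across samples (axis=1)
    row_stats = data_df.median(axis=1) if stat == "median" else data_df.mean(axis=1)
    # Transpose so probes become columns, fill each column from the Series,
    # then transpose back
    return data_df.T.fillna(row_stats).T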
Example #11
def read_gct_and_config_file(gct_path, config_path):
    """Read gct and config file.

    The config file has three sections: io, metadata, and parameters.
    These are returned as dictionaries.

    Args:
        gct_path (string): filepath to gct file
        config_path (string): filepath to config file

    Returns:
        gct (GCToo object)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)
    """
    assert os.path.exists(os.path.expanduser(config_path))

    # Read config file
    config_parser = ConfigParser.RawConfigParser()
    config_parser.read(os.path.expanduser(config_path))

    # Return config fields as dictionaries
    config_io = dict(config_parser.items("io"))
    config_metadata = dict(config_parser.items("metadata"))
    config_parameters = dict(config_parser.items("parameters"))

    # Parse the gct file and return GCToo object
    gct = parse(gct_path)

    return gct, config_io, config_metadata, config_parameters
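
A minimal sketch of writing a config file with the three sections read_gct_and_config_file expects; the option names are illustrative assumptions (ConfigParser is the Python 2 module name, matching the code above):

import ConfigParser

config = ConfigParser.RawConfigParser()
for section in ("io", "metadata", "parameters"):
    config.add_section(section)
config.set("io", "some_option", "some_value")  # hypothetical option
with open("example.cfg", "w") as f:
    config.write(f)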
Example #12
def save_drug_dataset(only_landmark_genes=False):
    expression_df = parse(gctx_path).data_df.transpose()
    label_df = pd.DataFrame.from_csv(label_path)
    signatures_df = pd.DataFrame.from_csv(sig_path, sep='\t')
    gene_info_df = pd.DataFrame.from_csv(gene_path, sep='\t')
    print "Expression DataFrame Shape:", expression_df.shape
    if only_landmark_genes:
        del_gene_list = []
        landmark_genes = gene_info_df.loc[gene_info_df['pr_is_lm'] ==
                                          1].index.values.tolist()
        for gene_id in expression_df.columns.values.tolist():
            if gene_id not in landmark_genes:
                del_gene_list.append(gene_id)
        expression_df = expression_df.drop(del_gene_list, axis=1)
        print "Expression DataFrame Only Landmark Genes Shape:", expression_df.shape

    drug_expression, drug_perturbations = get_drug_data(
        expression_df, signatures_df, label_df)
    label_df = get_drug_labels(drug_perturbations.keys(), label_df)
    dataset = dict()
    dataset["drug_expression"] = drug_expression
    dataset["drug_perturbations"] = drug_perturbations
    dataset["label_df"] = label_df
    with open(pickle_all_data_path, 'wb') as handle:
        pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return drug_perturbations, drug_expression, label_df
Example #13
def main():
    # get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args:  {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
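
The hstack and vstack used above come from cmapPy's concat module. A minimal horizontal-concatenation sketch; the positional argument order mirrors the call in Example #13, the file names are hypothetical, and the module path may vary across cmapPy versions:

from cmapPy.pandasGEXpress.parse import parse
from cmapPy.pandasGEXpress.concat import hstack

gctoos = [parse(f) for f in ("plate1.gct", "plate2.gct")]  # hypothetical files
# (gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove, reset_ids)
out_gctoo = hstack(gctoos, False, None, [], False)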
Example #14
    def main():
        # Read drug-protein interaction data from DrugBank.
        protein_drug_list, drug_name_list = FeatureGeneration.read_DrugBank()

        pertID_drugbankID_dict = generate_broadpert_DrugBank_dict()

        # Put all pert IDs that have DrugBank drug-protein interaction data into a dictionary.
        pertID_with_drugbank_interaction_dict = {}

        selected_drug_protein_list_with_CMap_data = {}

        for keys in pertID_drugbankID_dict:
            if pertID_drugbankID_dict[keys] in drug_name_list:
                pertID_with_drugbank_interaction_dict[keys] = pertID_drugbankID_dict[keys]

            #find the drug-protein pairs with drugs found in CMap...
            for drug_protein_pair in protein_drug_list:
                if pertID_drugbankID_dict[keys] == drug_protein_pair[1]:
                    selected_drug_protein_list_with_CMap_data[keys] = drug_protein_pair

        print "selected_drug_protein_list_with_CMap_data_overlap: " + str(len(selected_drug_protein_list_with_CMap_data))
        sys.exit()

        # A way to generate a mol object from a SMILES string directly.
        m2 = Chem.MolFromSmiles('C1CCC1')
        Mogen2_matrix = FeatureGeneration.generate_fingerprint("Morgan2", [m2])

        # play with GEO dataset..
        sig_info = pd.read_csv("GSE92742_Broad_LINCS_sig_info.txt", sep="\t")

        selected_sig_id_list = []
        test = []
        # Get the signature IDs for the perturbation drugs found in both the drug-target interaction pairs and CMap (~2700)
        for key in pertID_with_drugbank_interaction_dict:

            selected_sig_id_list.append(sig_info["sig_id"][sig_info["pert_id"] == key])

        gene_info = pd.read_csv("GSE92742_Broad_LINCS_gene_info.txt", sep="\t", dtype=str)

        landmark_gene_row_ids = gene_info["pr_gene_id"][gene_info["pr_is_lm"] == "1"]

        my_col_metadata = parse("GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", col_meta_only=True)

        print my_col_metadata
        print type(my_col_metadata)
        print np.shape(my_col_metadata)

        #vorinostat_only_gctoo = parse("GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=vorinostat_ids)



        #test = vorinostat_only_gctoo.data_df.as_matrix()
        #print type(test)
        #print np.shape(test)
        #print test


        sys.exit()
Example #15
def main(args):

    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)

    assert len(gct_paths) > 1, "full_path_wildcard: {}".format(
        full_path_wildcard)

    # Extract prefixes in order to use them later for saving
    prefixes = [(os.path.basename(path)).split(args.prefix_separator)[0]
                for path in gct_paths]

    for path, prefix in zip(gct_paths, prefixes):
        print "path: {}".format(path)
        print "prefix: {}".format(prefix)

    # Import gcts
    gctoos = [parse(x) for x in gct_paths]

    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for g, prefix in zip(gctoos, prefixes):

        # Extract data_df
        score_df = g.data_df

        # Must be square
        assert score_df.shape[0] == score_df.shape[1], (
            "Input dataframe must be square.")

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        if args.do_percentile_rank:
            rank_df = score_df.rank(ascending=False, pct=True) * 100
        else:
            rank_df = score_df.rank(ascending=False)

        # Make a GCToo
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=g.row_metadata_df,
                                 col_metadata_df=g.col_metadata_df)

        # Save the rank_df to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo,
                 out_name,
                 filler_null="NaN",
                 data_null="NaN",
                 metadata_null="NaN")
Example #16
def get_dataset():
    expression_df = parse(gctx_path).data_df.transpose()
    label_df = pd.DataFrame.from_csv(label_path)
    sig_info_df = pd.DataFrame.from_csv(sig_path, sep='\t')
    print "Expression shape:", expression_df.shape
    label_pert_ids = label_df.index.values
    del_sig_list = []
    # fill list with perturbation ids to be deleted
    for sig_id in expression_df.index.values:
        if sig_info_df.loc[sig_id, "pert_id"] not in label_pert_ids:
            del_sig_list.append(sig_id)
    # delete perturbations from data frame
    expression_df = expression_df.drop(del_sig_list)
    expression_df = expression_df.sample(frac=1)
    # collect signature side effect labels
    sig_label_data = []
    for sig_id in expression_df.index.values:
        pert_id = sig_info_df.loc[sig_id, "pert_id"]
        sig_labels = label_df.loc[pert_id]
        sig_label_data.append(sig_labels.values)

    sig_label_df = pd.DataFrame(data=sig_label_data,
                                index=expression_df.index.values,
                                columns=label_df.columns.values)

    print "Before column pruning y shape:", sig_label_df.shape
    adr_names = list(sig_label_df)
    del_adr_names = list()
    for adr_name in adr_names:
        if np.sum(sig_label_df.loc[:, adr_name].values) < prune_count:
            del_adr_names.append(adr_name)

    sig_label_df = sig_label_df.drop(del_adr_names, axis=1)
    print "After column pruning y shape:", sig_label_df.shape

    train_cnt = int(floor(expression_df.shape[0] * train_size))
    x_train = expression_df.iloc[0:train_cnt]
    y_train = sig_label_df.iloc[0:train_cnt]
    x_test = expression_df.iloc[train_cnt:]
    y_test = sig_label_df.iloc[train_cnt:]

    print "Before train/test column pruning y shape:", y_train.shape, y_test.shape
    adr_names = list(sig_label_df)
    del_adr_names = list()
    for adr_name in adr_names:
        if np.sum(y_train.loc[:, adr_name].values) < 1 or np.sum(
                y_test.loc[:, adr_name].values) < 1:
            del_adr_names.append(adr_name)

    y_train = y_train.drop(del_adr_names, axis=1)
    y_test = y_test.drop(del_adr_names, axis=1)
    print "After train/test column pruning y shape:", y_train.shape, y_test.shape
    return x_train, y_train, x_test, y_test
Example #17
def main(args):
    """ The main method. """

    # Read test gct
    test_gct = parse(args.test_gct_path, convert_neg_666=False, make_multiindex=True)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path, convert_neg_666=False, make_multiindex=True)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = prepare_multi_index_dfs(
        test_gct.multi_index_df, bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME,
        TARGET_FIELD_NAME,
        args.separator)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.multi_index_df, bg_gct.multi_index_df)

    # Compute connectivity
    (conn_mi_df, signed_conn_mi_df) = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        args.connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to write output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_col_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_row_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)

    # Create gct and write it to file
    conn_gct = GCToo.GCToo(data_df=signed_data_df, row_metadata_df=signed_row_metadata_df, col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN", metadata_null="NaN")
Example #18
    def test_main(self):
        test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                     "test_sip_in_test.gct")
        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
        out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")

        args_string = "-t {} -b {} -o {} -tfq {} -tft {} -bf {} -s {}".format(
            test_gct_path, bg_gct_path, out_path, "pert_iname", "pert_iname",
            "pert_iname", "|")
        args = sip.build_parser().parse_args(args_string.split())

        # Run main method
        sip.main(args)

        # Compare the output of main with the expected output
        e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                  "test_sip_expected_conn.gct")
        e_out_gct = parse(e_out_path)
        out_gct = parse(out_path)

        logger.debug("e_out_gct.data_df:\n{}".format(e_out_gct.data_df))
        logger.debug("out_gct.data_df:\n{}".format(out_gct.data_df))
        pd.util.testing.assert_frame_equal(e_out_gct.data_df, out_gct.data_df)

        logger.debug("e_out_gct.row_metadata_df:\n{}".format(
            e_out_gct.row_metadata_df))
        logger.debug("out_gct.row_metadata_df:\n{}".format(
            out_gct.row_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.row_metadata_df,
                                           out_gct.row_metadata_df)

        logger.debug("e_out_gct.col_metadata_df:\n{}".format(
            e_out_gct.col_metadata_df))
        logger.debug("out_gct.col_metadata_df:\n{}".format(
            out_gct.col_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.col_metadata_df,
                                           out_gct.col_metadata_df)

        # Remove the created file
        os.remove(out_path)
Example #19
def main(args):
    """ The main method. """

    # Read test gct
    test_gct = parse(args.test_gct_path)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.data_df, bg_gct.data_df)

    # Create an aggregated metadata field in test and background GCTs
    # that will be used to aggregate replicates
    (test_gct, bg_gct) = create_aggregated_fields_in_GCTs(
        test_gct, bg_gct, args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct, QUERY_FIELD_NAME,
        TARGET_FIELD_NAME, args.separator)

    # Compute connectivity
    (conn_gct, signed_conn_gct) = compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym,
        args.separator)

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)

    # Write signed result to file
    wg.write(signed_conn_gct,
             args.out_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #20
    def load(self):
        """
        Calls the cmapPy gctx parser, retrieves matrix and metadata

            returns: None
        """
        self.data = GEX.parse(self.path).data_df

        ##Dealing with cmapPy data type instability:
        rows = list(map(lambda x: x[2:-1], list(self.data.index)))
        self.data.index = rows
        columns = list(map(lambda x: x[2:-1], list(self.data)))
        self.data.columns = columns
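
The x[2:-1] slicing above strips a stringified bytes label such as "b'AGCT'" down to AGCT. A more direct sketch, assuming the labels come back as raw bytes (decode_labels is a hypothetical helper):

def decode_labels(labels):
    # Decode bytes labels; pass plain strings through unchanged
    return [x.decode("utf-8") if isinstance(x, bytes) else str(x) for x in labels]

# Usage: self.data.index = decode_labels(self.data.index)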
Example #21
def main(args):

    gct = parse(args.in_gct_path)

    (_, conn_gct) = do_steep_and_sip(gct, args.similarity_metric,
                                     args.connectivity_metric,
                                     args.fields_to_aggregate)

    # Write output gct
    wg.write(conn_gct,
             args.out_sip_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #22
def main(args):

    # Import data
    in_gct = parse(args.in_gct_path)

    # Compute distance df
    dist_df = 1 - in_gct.data_df

    # Create distance gct
    dist_gct = GCToo.GCToo(dist_df, in_gct.row_metadata_df,
                           in_gct.col_metadata_df)

    # Write dist_gct to file
    wg.write(dist_gct, args.out_name, filler_null="NA")
Example #23
	def test_main1(self):
		input_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
		                              "test_introspect_main.gct")
		output_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
		                               "test_introspect_main_out.gct")
		expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
		                                 "test_introspect_main_expected.gct")

		args_string = "-i {} -o {} -fa chd1".format(input_gct_path, output_gct_path)
		args = introspect.build_parser().parse_args(args_string.split())

		introspect.main(args)

		# Read in output and expected gcts and confirm that they're equal
		output_gct = parse(output_gct_path)
		expected_gct = parse(expected_gct_path)

		pd.util.testing.assert_almost_equal(expected_gct.data_df, output_gct.data_df, check_less_precise=2)
		pd.testing.assert_frame_equal(expected_gct.row_metadata_df, output_gct.row_metadata_df)
		pd.testing.assert_frame_equal(expected_gct.col_metadata_df, output_gct.col_metadata_df)

		# Clean up
		os.remove(output_gct_path)
Example #24
def read_reduced():
    """
    Reads in the reduced file and outputs a data frame with the maximal-magnitude signature for each pert_id.
    :return: reduced L1000 feature dataframe (as a pandas DataFrame)
    """
    ### read in the reduced data
    reduced_data = parse(join(FILE_PATH, "lm_sm_aggz.gctx"))

    ### read in the signature info and set the index to the signature id for easy indexing in the next step
    sig_info = pd.read_csv(join(FILE_PATH,
                                "GSE92742_Broad_LINCS_sig_info.txt"),
                           sep="\t")
    sig_info.index = sig_info['sig_id']

    ### map the columns to the pert_id that generated the signature to allow for comparison in spark
    reduced_data.data_df.columns = sig_info.loc[pd.Index(
        reduced_data.data_df.columns)]['pert_id']
    ### Return the data frame with pert_ids, in row-major form, ready for Scala
    return reduced_data.data_df.transpose()
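
An equivalent relabeling using reindex, which aligns sig_info's pert_id to the existing column order without constructing a pd.Index by hand:

reduced_data.data_df.columns = (
    sig_info["pert_id"].reindex(reduced_data.data_df.columns).values)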
Example #25
def main(args):

    # Parse gct
    gct = parse(args.input_gct_path)

    # TODO(LL): better integrate main_sym and main_asym

    # Figure out whether or not the gct is symmetric
    if gct.row_metadata_df.equals(gct.col_metadata_df):
        logger.info(("Row metadata equals column metadata. " +
                     "Assuming symmetric GCT."))

        assert args.row_annot_fields == args.col_annot_fields, (
            ("row_annot_fields should be the same as col_annot_fields if the " +
             "GCT is symmetric. args.row_annot_fields: {}, " +
             "args.col_annot_fields: {}").format(args.row_annot_fields,
                                                 args.col_annot_fields))

        assert args.query_in_row_or_col is None, (
            ("query_in_row_or_col should be None for symmetric GCTs. " +
             "args.query_in_row_or_col: {}").format(args.query_in_row_or_col))

        # Main method for symmetric gcts
        main_sym(gct, args.out_fig_name, args.out_gml_name,
                 args.row_annot_fields, args.my_query, args.query_field,
                 args.threshold, args.percentile, args.vertex_label_field,
                 args.vertex_color_field, layout=LAYOUT)

    else:
        logger.info(("Row metadata does not equal column metadata. " +
                     "Assuming asymmetric GCT."))

        assert args.query_in_row_or_col != "both", (
            ("query_in_row_or_col must not be 'both' if the matrix is " +
             "asymmetric. args.query_in_row_or_col: {}").format(
                args.query_in_row_or_col))

        # Main method for asymmetric gcts
        main_asym(gct, args.out_fig_name, args.out_gml_name,
                  args.row_annot_fields, args.col_annot_fields,
                  args.my_query, args.query_field, args.query_in_row_or_col,
                  args.threshold, args.percentile, args.vertex_label_field,
                  args.vertex_color_field)
Example #26
def main(args):
    """ The main method. """

    # Import gct
    in_gct = parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)
        wg.write(gct,
                 full_out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
Example #27
def reduce_and_save():
    """
    Reads in the level 5 data and outputs a file with only the landmark gene
    z-scores (rows) and the small-molecule perturbagens (cols).
    """
    ### Get the signature information
    sig_info = pd.read_csv(join(FILE_PATH,
                                "GSE92742_Broad_LINCS_sig_info.txt"),
                           sep="\t")
    ### Columns are:
    ###  Index([u'sig_id', u'pert_id', u'pert_iname', u'pert_type', u'cell_id',
    ###       u'pert_dose', u'pert_dose_unit', u'pert_idose', u'pert_time',
    ###       u'pert_time_unit', u'pert_itime', u'distil_id'],
    ###      dtype='object')

    ### Filter for signature ids of small-molecule perturbagens
    small_mol_sigs = sig_info['sig_id'][sig_info['pert_type'] == "trt_cp"]
    ### Results in 205034 signatures

    ### Read in the gene info
    gene_info = pd.read_csv(join(FILE_PATH,
                                 "GSE92742_Broad_LINCS_gene_info.txt"),
                            sep='\t')
    ### Index([u'pr_gene_id', u'pr_gene_symbol', u'pr_gene_title', u'pr_is_lm',
    ###      u'pr_is_bing'],
    ###      dtype='object')

    landmark_gene_ids = gene_info['pr_gene_id'][
        gene_info['pr_is_lm'] == 1]  #Filters for directly measured transcripts
    ### Results in the 978 landmark pr_gene_ids

    ### Load the main file, filtering the columns so that only the small-molecule signatures
    ### are loaded and the rows so that only the landmark genes are loaded, into cmapPy's
    ### custom GCToo container type
    relevent_sigs_gctoo = parse(join(
        FILE_PATH,
        "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"),
                                cid=small_mol_sigs,
                                rid=landmark_gene_ids)
    # print small_mol_sigs.data_df.shape
    ### Should write an intermediate file with dimensions (978, 205034)
    write_gctx.write(relevent_sigs_gctoo, join(FILE_PATH, "lm_sm_aggz"))
Example #28
    def test_gct_parsing(self):
        # parse in gct, no other arguments
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse("functional_tests/mini_gctoo_for_testing.gct")

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # check convert_neg_666 worked correctly
        self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())

        # parse w/o convert_neg_666
        mg2_alt = parse("functional_tests/mini_gctoo_for_testing.gct",
                        convert_neg_666=False)
        self.assertFalse(
            mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all())

        # check unused rid argument handling
        with self.assertRaises(Exception) as context:
            mg3 = parse("functional_tests/mini_gctoo_for_testing.gct",
                        rid=["a"])
        self.assertTrue(
            "parse_gct does not use the argument" in str(context.exception))

        # check unused cid argument handling
        with self.assertRaises(Exception) as context:
            mg4 = parse("functional_tests/mini_gctoo_for_testing.gct",
                        cid=["a"])
        self.assertTrue(
            "parse_gct does not use the argument" in str(context.exception))

        # check unused ridx argument handling
        with self.assertRaises(Exception) as context:
            mg5 = parse("functional_tests/mini_gctoo_for_testing.gct",
                        ridx=[0])
        self.assertTrue(
            "parse_gct does not use the argument" in str(context.exception))

        # check unused cidx argument handling
        with self.assertRaises(Exception) as context:
            mg6 = parse("functional_tests/mini_gctoo_for_testing.gct",
                        cidx=[0])
        self.assertTrue(
            "parse_gct does not use the argument" in str(context.exception))
Example #29
def main(args):
    # Import gct
    gct = parse(args.gct_file_path)

    # Get plate and well names
    (plate_names,
     well_names) = extract_plate_and_well_names(gct.col_metadata_df,
                                                args.plate_field,
                                                args.well_field)

    # Extract provenance code
    prov_code = utils.extract_prov_code(gct.col_metadata_df, PROV_CODE_FIELD,
                                        PROV_CODE_DELIMITER)

    # If data has been log-transformed, undo it
    unlogged_df = undo_log_transform_if_needed(gct.data_df, prov_code)

    # Divide by the maximum value for the row
    max_row_values = unlogged_df.max(axis='columns')
    divided_df = unlogged_df.div(max_row_values, axis="rows")

    # Calculate metrics for each sample
    medium_over_heavy_medians = divided_df.median(axis=0).values
    medium_over_heavy_means = divided_df.mean(axis=0).values
    medium_over_heavy_mads = divided_df.mad(axis=0).values
    medium_over_heavy_sds = divided_df.std(axis=0).values

    # Assemble plate_names, well_names, and metrics into a dataframe
    out_df = assemble_output_df(
        plate_names, well_names, {
            "medium_over_heavy_median": medium_over_heavy_medians,
            "medium_over_heavy_mad": medium_over_heavy_mads
        })

    # Write to pw file
    out_df.to_csv(args.out_pw_file_path, sep="\t", na_rep="NaN", index=False)
    logger.info("PW file written to {}".format(args.out_pw_file_path))
Example #30
from cmapPy.pandasGEXpress.parse import parse
import broadinstitute_psp.utils.separate_gct as sg
import broadinstitute_psp.utils.setup_logger as setup_logger

logger = logging.getLogger(setup_logger.LOGGER_NAME)

functional_tests_dir = "utils/functional_tests/"

in_gct_path = functional_tests_dir + "test_separate_in.gct"
thing1_gct_path = functional_tests_dir + "test_separate_expected_thing1.gct"
thing2_gct_path = functional_tests_dir + "test_separate_expected_thing2.gct"
a375_gct_path = functional_tests_dir + "test_separate_expected_A375.gct"
ht29_gct_path = functional_tests_dir + "test_separate_expected_HT29.gct"
a549_gct_path = functional_tests_dir + "test_separate_expected_A549.gct"

in_gct = parse(in_gct_path)
thing1_gct = parse(thing1_gct_path)
thing2_gct = parse(thing2_gct_path)
a375_gct = parse(a375_gct_path)
ht29_gct = parse(ht29_gct_path)
a549_gct = parse(a549_gct_path)


class TestSeparateGct(unittest.TestCase):
    def test_separate_row(self):
        (thing_gcts, thing_fields) = sg.separate(in_gct, "thing", "row")

        self.assertListEqual(thing_fields, [1, 2])

        pd.util.testing.assert_frame_equal(thing_gcts[0].data_df,
                                           thing1_gct.data_df)