Code example #1
    def test_check_df(self):
        not_unique_data_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                                          index=["A", "B"],
                                          columns=["a", "b", "a"])
        not_unique_rhd = pd.DataFrame([["rhd_A", "rhd_B"], ["rhd_C", "rhd_D"]],
                                      index=["A", "B"],
                                      columns=["rhd1", "rhd1"])
        """
        # case 3: row subsetting - sample subset > og # of samples
        with self.assertRaises(AssertionError) as context:
            random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row")
        self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception))

        """
        # cids in data_df are not unique
        with self.assertRaises(Exception) as context:
            GCToo.GCToo(data_df=not_unique_data_df,
                        row_metadata_df=pd.DataFrame(index=["A", "B"]),
                        col_metadata_df=pd.DataFrame(index=["a", "b", "c"]))
        # The assertion must sit outside the with-block; once GCToo.GCToo
        # raises, nothing after it inside the block would run
        self.assertIn(str(not_unique_data_df.columns), str(context.exception))

        # rhds are not unique in row_metadata_df
        with self.assertRaises(Exception) as context:
            GCToo.GCToo(data_df=pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                                             index=["A", "B"],
                                             columns=["a", "b", "c"]),
                        row_metadata_df=not_unique_rhd,
                        col_metadata_df=pd.DataFrame(index=["a", "b", "c"]))
        self.assertIn("'rhd1' 'rhd1'", str(context.exception))
Code example #2
    @classmethod
    def setUpClass(cls):
        cls.sym_data_df = pd.DataFrame(
            [[0.9, 0.4, 0.6], [0.4, 1.0, -0.3], [0.6, -0.3, 1.0]],
            index=["a", "b", "c"], columns=["a", "b", "c"])
        cls.sym_meta_df = pd.DataFrame(
            [["A375", "great"], ["A375", "bad"], ["A375", "ok"]],
            index=["a", "b", "c"], columns=["cell_id", "pert_type"])
        cls.sym_gct = GCToo.GCToo(cls.sym_data_df, cls.sym_meta_df, cls.sym_meta_df)

        cls.asym_data_df = pd.DataFrame(
            [[0.1, 0.4], [-0.7, -0.1], [np.nan, 0.9]],
            index=["A", "B", "C"], columns=["o", "k"])
        cls.asym_row_meta_df = pd.DataFrame(
            ["3h", "1h", "2h"], index=["A", "B", "C"], columns=["pert_time"])
        cls.asym_col_meta_df = pd.DataFrame(
            [["MCF7", "3h"], ["A375", "6h"]], index=["o", "k"],
            columns=["cell_id", "pert_time"])
        cls.asym_gct = GCToo.GCToo(cls.asym_data_df, cls.asym_row_meta_df, cls.asym_col_meta_df)

        cls.sym_g = ig.Graph()
        cls.sym_g.add_vertices(3)
        cls.sym_g.add_edges([(0, 1), (0, 2), (1, 2)])
        cls.sym_g.vs["id"] = ["a", "b", "c"]
        cls.sym_g.vs["cell_id"] = ["A375", "A375", "A375"]
        cls.sym_g.vs["pert_type"] = ["great", "bad", "ok"]
        cls.sym_g.es["weight"] = [0.4, 0.6, -0.3]

        cls.asym_g = ig.Graph()
        cls.asym_g.add_vertices(5)
        cls.asym_g.add_edges([(0, 3), (0, 4), (1, 3), (1, 4), (2, 3), (2, 4)])
        cls.asym_g.vs["id"] = ["A", "B", "C", "o", "k"]
        cls.asym_g.vs["type"] = [False, False, False, True, True]
        cls.asym_g.vs["cell_id"] = [None, None, None, "MCF7", "A375"]
        cls.asym_g.vs["pert_time"] = ["3h", "1h", "2h", "3h", "6h"]
        cls.asym_g.es["weight"] = [0.1, 0.4, -0.7, -0.1, np.nan, 0.9]
Code example #3
def ssmd_ecdf(norm_gct, median_gct, title, outfile):

    # Collect per-replicate SSMD values and concatenate once at the end
    # (Series.append was removed in pandas 2.0)
    norm_ssmd_parts = []
    median_ssmd_parts = []

    for rep in norm_gct.col_metadata_df['prism_replicate'].unique():
        norm_temp = norm_gct.data_df[norm_gct.col_metadata_df[norm_gct.col_metadata_df['prism_replicate'] == rep].index]
        norm_temp_col = norm_gct.col_metadata_df.loc[norm_gct.col_metadata_df['prism_replicate'] == rep]
        norm_ssmd_parts.append(get_ssmd(
            GCToo.GCToo(data_df=norm_temp, col_metadata_df=norm_temp_col,
                        row_metadata_df=norm_gct.row_metadata_df),
            unlog=True))
        med_temp = median_gct.data_df[median_gct.col_metadata_df[median_gct.col_metadata_df['prism_replicate'] == rep].index]
        med_temp_col = median_gct.col_metadata_df.loc[median_gct.col_metadata_df['prism_replicate'] == rep]
        median_ssmd_parts.append(get_ssmd(
            GCToo.GCToo(data_df=med_temp, col_metadata_df=med_temp_col,
                        row_metadata_df=median_gct.row_metadata_df)))

    norm_ssmd = pd.concat(norm_ssmd_parts)
    median_ssmd = pd.concat(median_ssmd_parts)

    norm_ecdf = ECDF(norm_ssmd)
    med_ecdf = ECDF(median_ssmd)

    plt.plot(norm_ecdf.x, norm_ecdf.y, label='NORM')
    plt.plot(med_ecdf.x, med_ecdf.y, label='MFI')
    plt.xlim(-1, 10)
    plt.xlabel('SSMD Values')
    plt.title(title)
    axes = plt.gca()
    axes.legend(bbox_to_anchor=(.615, 0.81, 0.8, .6), loc=3, borderaxespad=0.)
    plt.savefig(os.path.join(outfile, 'SSMD_ECDF.png'))
Code example #4
File: introspect.py Project: karenchris/psp
def do_steep_and_sip(gct, similarity_metric, connectivity_metric, fields_to_aggregate):
	""" Perform steep and sip on the same GCT. AKA introspect.

	Args:
	    gct (GCToo object)
	    similarity_metric (string)
	    connectivity_metric (string)
	    fields_to_aggregate (list of strings)

	Returns:
	    sim_gct (GCToo object): similarity GCT
	    conn_gct (GCToo object): connectivity GCT

	"""

	#----------STEEP--------#

	sim_df = steep.compute_similarity_within_df(gct.data_df, similarity_metric)

	# Row and column metadata are both from gct
	metadata_df = gct.col_metadata_df

	# Append column to metadata_df indicating which similarity_metric was used
	metadata_df[SIMILARITY_METRIC_FIELD] = similarity_metric

	# Assemble similarity gct
	sim_gct = GCToo.GCToo(data_df=sim_df, row_metadata_df=metadata_df,
	                      col_metadata_df=metadata_df)

	#----------SIP----------#

	# Check symmetry
	(is_test_df_sym, _) = sip.check_symmetry(sim_gct.data_df, sim_gct.data_df)

	# Create deep copies of sim_gct in order to leave the original GCT untouched
	test_gct = GCToo.GCToo(data_df=sim_df.copy(deep=True),
	                       row_metadata_df=metadata_df.copy(deep=True),
	                       col_metadata_df=metadata_df.copy(deep=True))
	bg_gct = GCToo.GCToo(data_df=sim_df.copy(deep=True),
	                     row_metadata_df=metadata_df.copy(deep=True),
	                     col_metadata_df=metadata_df.copy(deep=True))

	# Create an aggregated metadata field for index and columns of sim_gct
	# and sort by that field
	(test_gct, bg_gct) = sip.create_aggregated_fields_in_GCTs(
		test_gct, bg_gct, fields_to_aggregate, fields_to_aggregate,
		fields_to_aggregate, QUERY_FIELD_NAME, TARGET_FIELD_NAME, SEPARATOR)

	# Compute connectivity
	(_, signed_conn_gct) = sip.compute_connectivities(
		test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
		connectivity_metric, is_test_df_sym, SEPARATOR)

	# Append to queries a new column saying what connectivity metric was used
	sip.add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df, connectivity_metric, CONNECTIVITY_METRIC_FIELD)
	sip.add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df, connectivity_metric, CONNECTIVITY_METRIC_FIELD)

	return sim_gct, signed_conn_gct
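
A hedged usage sketch of the introspect entry point above; gct stands for any parsed GCToo, and the metric names and aggregation fields are assumptions based on the metrics psp's steep and sip modules support:

# sim_gct holds column-by-column similarities of gct with itself;
# conn_gct holds the connectivity of those similarities
sim_gct, conn_gct = do_steep_and_sip(
    gct,
    similarity_metric="spearman",
    connectivity_metric="ks_test",
    fields_to_aggregate=["pert_id", "cell_id"])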
Code example #5
File: steep.py Project: yuanjun/psp
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name))
Code example #6
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path, convert_neg_666=False, make_multiindex=True)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path,
                     convert_neg_666=False,
                     make_multiindex=True)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    wg.write(out_gct,
             args.out_name,
             data_null="NaN",
             metadata_null="NA",
             filler_null="NA")
Code example #7
def main(args, proj_dir, out_dir, project_name, invar=True):

    # Read in the data
    data_map, metadata_map = read_build_data(proj_dir=proj_dir)

    # Make folders for different outputs
    mk_folders(out_dir=out_dir, folders=['sensitivities', 'distributions', 'heatmaps'])

    # Check if project name arg is filled, if not use base folder name
    if project_name is None:
        project_name = os.path.basename(os.path.dirname(proj_dir))

    # Make distributions and heatmaps of all data at each data level
    mk_distributions(data_map, metadata_map, project_name, out_dir)

    # If expected sensitivities arg is set to true, run expected sensitivities analysis
    # TODO add argument for defining sensitivity cell set

    # Make standard SC plot for whole dataset, signal strength vs correlation
    prism_plots.sc_plot(metadata_map['sig'], os.path.join(out_dir,'sc_modz.zspc.png'))

    # Make modz distribuions split by pert type
    if 'combat_modz' in data_map.keys():
        comp.modz_dist(data_map['combat_modz'], metadata_map['cb_sig'], [], os.path.join(out_dir, 'modz_dist.png'))

    # If running on data with control barcodes, plot monotonicity of curves
    #if invar is True:
    #    inv.invariant_monotonicity(data_map['mfi'], metadata_map['inst'], out_dir)

    # Calculate median SSMD by pool and output in table
    ssmd.ssmd_by_pool(metadata_map['ssmd'], metadata_map['cell'], out_dir)

    # Get cell metadata without control barcodes for later use
    norm_cell_metadata = metadata_map['cell'].loc[[x for x in metadata_map['cell'].index if x in data_map['norm'].row_metadata_df.index]]

    # ECDF of SSMD Scores in Norm Data and MFI Data
    if 'norm' in data_map.keys() and 'mfi' in data_map.keys():
        ssmd.ssmd_ecdf(GCToo.GCToo(data_df=data_map['norm'].data_df,
                                   col_metadata_df=metadata_map['inst'].loc[data_map['norm'].data_df.columns],
                                   row_metadata_df=norm_cell_metadata),
                       GCToo.GCToo(data_df=data_map['mfi'].data_df,
                                   col_metadata_df=metadata_map['inst'].loc[data_map['mfi'].data_df.columns],
                                   row_metadata_df=metadata_map['cell']),
                       'SSMD ECDF for {}'.format(os.path.dirname(proj_dir)),
                       out_dir)

    # Make a bunch of plots at the plate level for each plate in cohort
    if args.plate_qc:
        get_plate_qc_data_map_and_run(data_map, metadata_map, norm_cell_metadata, project_name, out_dir, invar)
        # TODO: add a check for get_plate_qc_data_map_and_run before running qc_galleries --> dependent
        qc_galleries(out_dir, project_name, metadata_map, data_map)
Code example #8
    def test_multi_index_df_to_component_dfs(self):
        mi_df_index = pd.MultiIndex.from_arrays(
            [["D", "E"], [-666, -666], ["dd", "ee"]],
            names=["rid", "rhd1", "rhd2"])
        mi_df_columns = pd.MultiIndex.from_arrays(
            [["A", "B", "C"], [1, 2, 3], ["Z", "Y", "X"]],
            names=["cid", "chd1", "chd2"])
        mi_df = pd.DataFrame([[1, 3, 5], [7, 11, 13]],
                             index=mi_df_index,
                             columns=mi_df_columns)

        e_row_metadata_df = pd.DataFrame([[-666, "dd"], [-666, "ee"]],
                                         index=pd.Index(["D", "E"],
                                                        name="rid"),
                                         columns=pd.Index(["rhd1", "rhd2"],
                                                          name="rhd"))
        e_col_metadata_df = pd.DataFrame([[1, "Z"], [2, "Y"], [3, "X"]],
                                         index=pd.Index(["A", "B", "C"],
                                                        name="cid"),
                                         columns=pd.Index(["chd1", "chd2"],
                                                          name="chd"))
        e_data_df = pd.DataFrame([[1, 3, 5], [7, 11, 13]],
                                 index=pd.Index(["D", "E"], name="rid"),
                                 columns=pd.Index(["A", "B", "C"], name="cid"))

        (data_df, row_df,
         col_df) = GCToo.multi_index_df_to_component_dfs(mi_df)

        self.assertTrue(col_df.equals(e_col_metadata_df))
        self.assertTrue(row_df.equals(e_row_metadata_df))
        self.assertTrue(data_df.equals(e_data_df))

        # edge case: if the index (or column) of the multi-index has only one
        # level, it becomes a regular index
        mi_df_index_plain = pd.MultiIndex.from_arrays([["D", "E"]],
                                                      names=["rid"])
        mi_df2 = pd.DataFrame([[1, 3, 5], [7, 11, 13]],
                              index=mi_df_index_plain,
                              columns=mi_df_columns)

        # row df should be empty
        e_row_df2 = pd.DataFrame(index=["D", "E"])

        (data_df2, row_df2,
         col_df2) = GCToo.multi_index_df_to_component_dfs(mi_df2)
        self.assertTrue(row_df2.equals(e_row_df2))
        self.assertTrue(col_df2.equals(e_col_metadata_df))
        self.assertTrue(data_df2.equals(e_data_df))
Code example #9
File: zscore.py Project: cmap/merino
def calculate_zscore(df, plate_control=False):

    # Calculate level 4 data from level 3

    if not plate_control:
        neg_dex = df.col_metadata_df[df.col_metadata_df['pert_type'] == 'ctl_vehicle'].index.tolist()
        neg_df = df.data_df[neg_dex]
        zscore_data = zscore(df.data_df, neg_df)
        df.col_metadata_df['data_level'] = 'ZSVC'
        df.col_metadata_df['provenance'] = [x + ' | ZSVC' for x in df.col_metadata_df['provenance']]

    else:
        zscore_data = zscore(df.data_df)
        df.col_metadata_df['data_level'] = 'ZSPC'
        df.col_metadata_df['provenance'] = [x + ' | ZSPC' for x in df.col_metadata_df['provenance']]

    row_metadata_df = df.row_metadata_df

    # Clip extreme z-scores to the range [-10, 10]
    zscore_data = zscore_data.clip(lower=-10, upper=10)

    zscore_data.sort_index(inplace=True)
    row_metadata_df.sort_index(inplace=True)
    zscore_gctoo = GCToo.GCToo(data_df=zscore_data, row_metadata_df=row_metadata_df, col_metadata_df=df.col_metadata_df)

    return zscore_gctoo
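
A usage sketch of the two branches, assuming level3_gct is a GCToo whose col_metadata_df carries the 'pert_type' and 'provenance' fields the function reads:

# Vehicle-control z-scoring (ZSVC): center on ctl_vehicle wells only
zsvc_gct = calculate_zscore(level3_gct, plate_control=False)

# Plate-control z-scoring (ZSPC): center each row on the whole plate
zspc_gct = calculate_zscore(level3_gct, plate_control=True)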
Code example #10
File: test_sip.py Project: karenchris/psp
    def test_extract_bg_vals_from_sym(self):

        bg_meta_df = pd.DataFrame({
            "group": ["A", "B", "A", "B", "C", "C"],
            "id": [1, 2, 3, 4, 5, 6]
        })
        bg_data_df = pd.DataFrame([[1.0, 0.5, 1.0, -0.4, 1.1, -0.6],
                                   [0.5, 1.0, 1.2, -0.8, -0.9, 0.4],
                                   [1.0, 1.2, 1.0, 0.1, 0.3, 1.3],
                                   [-0.4, -0.8, 0.1, 1.0, 0.5, -0.2],
                                   [1.1, -0.9, 0.3, 0.5, 1.0, 0.7],
                                   [-0.6, 0.4, 1.3, -0.2, 0.7, 1.0]])
        bg_gct = GCToo.GCToo(data_df=bg_data_df,
                             row_metadata_df=bg_meta_df,
                             col_metadata_df=bg_meta_df)

        # Expected values
        e_A_vals = [0.5, 1.0, -0.4, 1.1, -0.6, 1.2, 0.1, 0.3, 1.3]
        e_B_vals = [0.5, 1.2, -0.8, -0.9, 0.4, -0.4, 0.1, 0.5, -0.2]
        e_C_vals = [1.1, -0.9, 0.3, 0.5, 0.7, -0.6, 0.4, 1.3, -0.2]

        A_vals = sip.extract_bg_vals_from_sym("A", "group", bg_gct)
        self.assertItemsEqual(e_A_vals, A_vals)

        B_vals = sip.extract_bg_vals_from_sym("B", "group", bg_gct)
        self.assertItemsEqual(e_B_vals, B_vals)

        C_vals = sip.extract_bg_vals_from_sym("C", "group", bg_gct)
        self.assertItemsEqual(e_C_vals, C_vals)

        # Verify that assert statement works
        with self.assertRaises(AssertionError) as e:
            sip.extract_bg_vals_from_sym("D", "group", bg_gct)
        self.assertIn("D is not in the group metadata", str(e.exception))
Code example #11
def vstack(gctoos,
           remove_all_metadata_fields=False,
           error_report_file=None,
           fields_to_remove=[],
           reset_ids=False):
    """ Vertically concatenate gctoos.

    Args:
        gctoos (list of gctoo objects)
        remove_all_metadata_fields (bool):  ignore/strip all common metadata when combining gctoos
        error_report_file (string):  path to write file containing error report indicating 
            problems that occurred during vstack, mainly for inconsistencies in common metadata
        fields_to_remove (list of strings): fields to be removed from the
            common metadata because they don't agree across files
        reset_ids (bool): set to True if row ids are not unique

    Return:
        concated (gctoo object)
    """
    # Separate each gctoo into its component dfs
    row_meta_dfs = []
    col_meta_dfs = []
    data_dfs = []
    srcs = []
    for g in gctoos:
        row_meta_dfs.append(g.row_metadata_df)
        col_meta_dfs.append(g.col_metadata_df)
        data_dfs.append(g.data_df)
        srcs.append(g.src)

    # Concatenate col metadata
    all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove,
                                               srcs,
                                               remove_all_metadata_fields,
                                               error_report_file)

    # Concatenate row metadata
    all_row_metadata_df = assemble_concatenated_meta(
        row_meta_dfs, remove_all_metadata_fields)

    # Concatenate the data_dfs
    all_data_df = assemble_data(data_dfs, "vert")

    # Make sure df shapes are correct
    assert all_data_df.shape[0] == all_row_metadata_df.shape[
        0], "Number of rows is incorrect."
    assert all_data_df.shape[1] == all_col_metadata_df.shape[
        0], "Number of columns is incorrect."

    # If requested, reset sample ids to be unique integers and move old sample
    # ids into column metadata
    if reset_ids:
        do_reset_ids(all_row_metadata_df, all_data_df, "vert")

    logger.info("Build GCToo of all...")
    concated = GCToo.GCToo(row_metadata_df=all_row_metadata_df,
                           col_metadata_df=all_col_metadata_df,
                           data_df=all_data_df)

    return concated
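
A hedged usage sketch: two GCTs that share the same sample columns but have disjoint row ids (gct_a, gct_b, and the disagreeing field name are hypothetical):

combined = vstack([gct_a, gct_b], fields_to_remove=["det_plate"])

# Rows are concatenated, columns (samples) stay fixed
assert combined.data_df.shape[0] == (gct_a.data_df.shape[0] +
                                     gct_b.data_df.shape[0])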
Code example #12
File: test_sip.py Project: karenchris/psp
    def test_extract_bg_vals_from_non_sym(self):
        bg_row_meta_df = pd.DataFrame({
            "group": ["A", "B", "A", "B"],
            "id": [1, 2, 3, 4]
        })
        bg_col_meta_df = pd.DataFrame({
            "group": ["F", "F", "E", "E"],
            "id": [1, 2, 3, 4]
        })
        bg_data_df = pd.DataFrame([[1, 2, 3, 5], [7, 11, 13, 17],
                                   [19, 23, 29, 31], [-3, 5, 7, 11]])
        bg_gct = GCToo.GCToo(data_df=bg_data_df,
                             row_metadata_df=bg_row_meta_df,
                             col_metadata_df=bg_col_meta_df)

        # Expected values
        e_A_vals = [1, 2, 3, 5, 19, 23, 29, 31]
        e_B_vals = [7, 11, 13, 17, -3, 5, 7, 11]

        A_vals = sip.extract_bg_vals_from_non_sym("A", "group", bg_gct)
        self.assertItemsEqual(e_A_vals, A_vals)

        B_vals = sip.extract_bg_vals_from_non_sym("B", "group", bg_gct)
        self.assertItemsEqual(e_B_vals, B_vals)

        # Verify that assert statement works
        with self.assertRaises(AssertionError) as e:
            sip.extract_bg_vals_from_non_sym("D", "group", bg_gct)
        self.assertIn("target D is not in the group metadata",
                      str(e.exception))
Code example #13
File: test_merino.py Project: cmap/merino
    def test_count_shear(self):
        count = GCToo.GCToo(
            data_df=pd.DataFrame(
                {
                    'test_CS0_X1:A': [40, 50, 30, 20, 10],
                    'test_CS0_X1:B': [110, 80, 60, 40, 30],
                    'test_CS0_X1:C': [5, 15, 4, 3, 5],
                    'test_CS0_X1:D': [60, 90, 70, 8, 8],
                    'test_CS0_X1:E': [75, 85, 60, 9, 10]
                },
                index=['1', '2', '3', '661', '662']),
            row_metadata_df=pd.DataFrame(index=['1', '2', '3', '661', '662']),
            col_metadata_df=pd.DataFrame(
                {
                    'pert_type': [
                        'trt_cp', 'trt_cp', 'trt_cp', 'ctl_vehicle',
                        'ctl_vehicle'
                    ]
                },
                index=[
                    'test_CS0_X1:A', 'test_CS0_X1:B', 'test_CS0_X1:C',
                    'test_CS0_X1:D', 'test_CS0_X1:E'
                ]))
        shear = norm.remove_low_bead_wells(l, count)

        print(shear.data_df.shape)

        assert len(shear.data_df.columns) == 4
Code example #14
File: test_dry.py Project: yuanjun/psp
    def test_insert_offsets_and_prov_code(self):
        data = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                            index=["a", "b", "c", "d"],
                            columns=["e", "f", "g"])
        row_meta = pd.DataFrame(
            [["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
            index=["a", "b", "c", "d"],
            columns=["row_field1", "row_field2"])
        col_meta = pd.DataFrame(
            [["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
            index=["e", "f", "g"],
            columns=["col_field1", "col_field2"])
        in_gct = GCToo.GCToo(data_df=data,
                             row_metadata_df=row_meta,
                             col_metadata_df=col_meta)
        offsets = np.array([3.0, 5.0, 8.0])
        offsets_field = "offsets"
        prov_code = ["A", "B", "C", "D"]
        prov_code_field = "col_field2"
        prov_code_delimiter = "+"
        e_col_meta = pd.DataFrame(
            [["cm1", "A+B+C+D", 3.0], ["cm3", "A+B+C+D", 5.0],
             ["cm5", "A+B+C+D", 8.0]],
            index=["e", "f", "g"],
            columns=["col_field1", "col_field2", "offsets"])

        out_gct = dry.insert_offsets_and_prov_code(in_gct, offsets,
                                                   offsets_field, prov_code,
                                                   prov_code_field,
                                                   prov_code_delimiter)

        self.assertTrue(np.array_equal(out_gct.col_metadata_df, e_col_meta))
Code example #15
File: test_dry.py Project: yuanjun/psp
    def test_log_transform_if_needed(self):
        prov_code = ["GR1", "L2X"]
        rids = ["a", "b", "c"]
        cids = ["A", "B", "C"]
        in_df = pd.DataFrame(
            [[10, 3, 1.2], [0.45, 0.2, 0], [4.5, np.nan, 0.3]],
            index=rids,
            columns=cids,
            dtype=float)

        in_gct = GCToo.GCToo(data_df=in_df,
                             row_metadata_df=pd.DataFrame(index=rids),
                             col_metadata_df=pd.DataFrame(index=cids))

        # Nothing should happen
        (out_gct,
         out_prov_code) = dry.log_transform_if_needed(in_gct, prov_code, "L2X")
        self.assertTrue(
            np.allclose(out_gct.data_df, in_df, atol=1e-3, equal_nan=True))
        self.assertEqual(out_prov_code, prov_code)

        # L2X should occur
        prov_code2 = ["GR1"]
        (_,
         out_prov_code2) = dry.log_transform_if_needed(in_gct, prov_code2,
                                                       "L2X")
        self.assertEqual(out_prov_code2, prov_code)

        in_gct.data_df.iloc[0, 1] = -3
        with self.assertRaises(AssertionError) as e:
            (_, _) = dry.log_transform_if_needed(in_gct, prov_code2, "L2X")
        self.assertIn("data_df should not contain negative", str(e.exception))
Code example #16
    def test_assemble_multi_index_df(self):

        # TODO: Add test of only row ids present as metadata
        # TODO: Add test of only col ids present as metadata

        g = GCToo.GCToo(data_df=pd.DataFrame(
            {
                10: range(13, 16),
                11: range(16, 19),
                12: range(19, 22)
            },
            index=range(4, 7)),
                        row_metadata_df=pd.DataFrame({"a": range(3)},
                                                     index=range(4, 7)),
                        col_metadata_df=pd.DataFrame({"b": range(7, 10)},
                                                     index=range(10, 13)),
                        make_multiindex=True)

        assert "a" in g.multi_index_df.index.names, g.multi_index_df.index.names
        assert "rid" in g.multi_index_df.index.names, g.multi_index_df.index.names

        assert "b" in g.multi_index_df.columns.names, g.multi_index_df.columns.names
        assert "cid" in g.multi_index_df.columns.names, g.multi_index_df.columns.names

        r = g.multi_index_df.xs(7, level="b", axis=1)
        logger.debug("r:  {}".format(r))
        assert r.xs(4, level="rid",
                    axis=0).values[0][0] == 13, r.xs(4, level="rid",
                                                     axis=0).values[0][0]
        assert r.xs(5, level="rid",
                    axis=0).values[0][0] == 14, r.xs(5, level="rid",
                                                     axis=0).values[0][0]
        assert r.xs(6, level="rid",
                    axis=0).values[0][0] == 15, r.xs(6, level="rid",
                                                     axis=0).values[0][0]
Code example #17
    def test_main(self):
        out_name = os.path.join(FUNCTIONAL_TESTS_PATH, "test_main_out.gct")

        gctoo = GCToo.GCToo(data_df=self.data_df,
                            row_metadata_df=self.row_metadata_df,
                            col_metadata_df=self.col_metadata_df)
        wg.write(gctoo,
                 out_name,
                 data_null="NaN",
                 metadata_null="-666",
                 filler_null="-666")

        # Read in the gct and verify that it's the same as gctoo
        new_gct = pg.parse(out_name)

        pd.util.testing.assert_frame_equal(new_gct.data_df, gctoo.data_df)
        pd.util.testing.assert_frame_equal(new_gct.row_metadata_df,
                                           gctoo.row_metadata_df)
        pd.util.testing.assert_frame_equal(new_gct.col_metadata_df,
                                           gctoo.col_metadata_df)

        # Also check that missing values were written to the file as expected
        in_df = pd.read_csv(out_name,
                            sep="\t",
                            skiprows=2,
                            keep_default_na=False)
        self.assertEqual(in_df.iloc[0, 1], "-666")
        self.assertEqual(in_df.iloc[5, 6], "NaN")

        # Cleanup
        os.remove(out_name)
Code example #18
File: dry.py Project: hrk2109/psp
def log_transform_if_needed(gct, prov_code, prov_code_entry):
    """Perform log2 transformation if it hasn't already been done.

    Args:
        gct (GCToo object)
        prov_code (list of strings)
        prov_code_entry (string)

    Returns:
        out_gct (GCToo object)
        updated_prov_code (list of strings): updated
    """
    # Check if log2 transformation has already occurred
    if prov_code_entry in prov_code:
        logger.info("{} has already occurred.".format(prov_code_entry))
        updated_prov_code = prov_code

        out_gct = gct

    else:
        assert not (gct.data_df < 0).sum().sum(), (
            "data_df should not contain negative values. gct.data_df:\n{}".
            format(gct.data_df))

        out_df = log_transform(gct.data_df, log_base=2)
        updated_prov_code = prov_code + [prov_code_entry]

        # Return new GCToo
        out_gct = GCToo.GCToo(out_df, gct.row_metadata_df, gct.col_metadata_df)

    return out_gct, updated_prov_code
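
A short sketch of the idempotent behavior, assuming gct is a GCToo with non-negative data and reusing the "PR1"/"L2X" provenance codes seen elsewhere in these examples:

# First call transforms and appends "L2X" to the provenance code
gct2, code2 = log_transform_if_needed(gct, ["PR1"], "L2X")

# Second call is a no-op because "L2X" is already present
gct3, code3 = log_transform_if_needed(gct2, code2, "L2X")
assert code2 == code3 == ["PR1", "L2X"]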
Code example #19
def continuous_renormalization(args):

    # Read in GCT, if path provided, and make deep copies of all DataFrames
    if args.in_gct_path:
        gct = parse.parse(args.in_gct_path)
    else:
        gct = args.in_gct
    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    # Remove rows that are all NA values
    data_df = data_df.loc[(data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]), :]

    # Pull out enrichment scores from column metadata dataframe
    enrichment_scores = col_metadata_df.loc[:, "det_well_enrichment_score"].copy(deep=True)

    # Calculate limit as x approaches 1 for non-median normalized data
    pep_y_offsets = calculate_y_offsets(data_df, enrichment_scores)
    
    # Calculate the fit parameters
    fit_parameters = calculate_fit(data_df, enrichment_scores, pep_y_offsets)

    # Annotate which rows will be renormalized based on slope_cutoff argument (default 0.2)
    row_metadata_df["is_log_renormed"] = is_log_renormed(fit_parameters.loc[:, "deg1"].apply(get_slope),
                                                         args.slope_cutoff)
    
    # Calculate the offset matrix
    offset_mat = calculate_peptide_sample_offsets(data_df, row_metadata_df, enrichment_scores, fit_parameters,
                                                  pep_y_offsets)

    # Calculate the output DataFrame
    out_data_df = calculate_out_matrix(data_df, offset_mat)

    # Add the 'renorm_correction' metadata field with total sample offset values
    col_metadata_df["renorm_correction"] = calculate_total_sample_offsets(offset_mat)

    # Output
    if args.write_gct:
        write_gct.write(GCToo.GCToo(data_df=out_data_df,
                                col_metadata_df=col_metadata_df,
                                row_metadata_df=row_metadata_df),
                        args.out_name)
    else:
        return GCToo.GCToo(data_df=out_data_df,
                           col_metadata_df=col_metadata_df,
                           row_metadata_df=row_metadata_df)
Code example #20
File: subset_gctoo.py Project: zhji0426/cmapPy
def subset_gctoo(gctoo,
                 row_bool=None,
                 col_bool=None,
                 rid=None,
                 cid=None,
                 ridx=None,
                 cidx=None,
                 exclude_rid=None,
                 exclude_cid=None):
    """ Extract a subset of data from a GCToo object in a variety of ways.
    The order of rows and columns will be preserved.

    Args:
        gctoo (GCToo object)
        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
        rid (list of strings): rids to include
        cid (list of strings): cids to include
        ridx (list of integers): row integer ids to include
        cidx (list of integers): col integer ids to include
        exclude_rid (list of strings): rids to exclude
        exclude_cid (list of strings): cids to exclude

    Returns:
        out_gctoo (GCToo object): gctoo after subsetting
    """
    assert sum([
        (rid is not None), (row_bool is not None), (ridx is not None)
    ]) <= 1, ("Only one of rid, row_bool, and ridx can be provided.")
    assert sum([
        (cid is not None), (col_bool is not None), (cidx is not None)
    ]) <= 1, ("Only one of cid, col_bool, and cidx can be provided.")

    # Figure out what rows and columns to keep
    rows_to_keep = get_rows_to_keep(gctoo, rid, row_bool, ridx, exclude_rid)
    cols_to_keep = get_cols_to_keep(gctoo, cid, col_bool, cidx, exclude_cid)

    # Convert labels to boolean array to preserve order
    rows_to_keep_bools = gctoo.data_df.index.isin(rows_to_keep)
    cols_to_keep_bools = gctoo.data_df.columns.isin(cols_to_keep)

    # Make the output gct
    out_gctoo = GCToo.GCToo(
        src=gctoo.src,
        version=gctoo.version,
        data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
        row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
        col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])

    assert out_gctoo.data_df.size > 0, "Subsetting yielded an empty gct!"

    logger.info(
        ("Initial GCToo with {} rows and {} columns subsetted down to " +
         "{} rows and {} columns.").format(gctoo.data_df.shape[0],
                                           gctoo.data_df.shape[1],
                                           out_gctoo.data_df.shape[0],
                                           out_gctoo.data_df.shape[1]))

    return out_gctoo
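
A usage sketch of the selectors (my_gct and the ids are hypothetical). rid, row_bool, and ridx are mutually exclusive, as are cid, col_bool, and cidx, but an include selector can be combined with an exclude list:

# Keep two named rows and drop one sample
small = subset_gctoo(my_gct,
                     rid=["feature_1", "feature_2"],
                     exclude_cid=["bad_sample"])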
Code example #21
File: weave.py Project: cmap/merino
def drop_less_than_2_replicates(modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df):
    """
    For all input data frames, removes signature entries where the number of profiles in that signature (nprofile) = 1.
    Returns filtered data frames.
    """
    sub_mat = modZ_GCT.data_df.drop(cc_q75_df[cc_q75_df['nprofile'] < 2].index, axis=1)
    sub_col_mat = modZ_GCT.col_metadata_df.drop(cc_q75_df[cc_q75_df['nprofile'] < 2].index)
    modZ_GCT = GCToo.GCToo(data_df=sub_mat, col_metadata_df=sub_col_mat, row_metadata_df=modZ_GCT.row_metadata_df)
    cc_q75_df.drop(cc_q75_df[cc_q75_df['nprofile'] < 2].index, inplace=True)

    sub_mat = cb_modZ_GCT.data_df.drop(cb_cc_q75_df[cb_cc_q75_df['nprofile'] < 2].index, axis=1)
    sub_col_mat = cb_modZ_GCT.col_metadata_df.drop(cb_cc_q75_df[cb_cc_q75_df['nprofile'] < 2].index)
    cb_modZ_GCT = GCToo.GCToo(data_df=sub_mat, col_metadata_df=sub_col_mat,
                              row_metadata_df=cb_modZ_GCT.row_metadata_df)
    cb_cc_q75_df.drop(cb_cc_q75_df[cb_cc_q75_df['nprofile'] < 2].index, inplace=True)

    return (modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df)
Code example #22
File: test_slice_gctoo.py Project: Starlida/cmapPy
    @classmethod
    def setUpClass(cls):
        data_df = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
                               index=["a", "b", "c", "d"], columns=["e", "f", "g"])
        row_metadata_df = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
                                       index=["a", "b", "c", "d"], columns=["rhd1", "rhd2"])
        col_metadata_df = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
                                       index=["e", "f", "g"], columns=["chd1", "chd2"])
        cls.in_gct = GCToo.GCToo(data_df, row_metadata_df, col_metadata_df)
Code example #23
File: parse_gct.py Project: yangluom/cmapPy
def create_gctoo_obj(file_path, version, row_metadata_df, col_metadata_df, data_df, make_multiindex):

    # Move dataframes into GCToo object
    gctoo_obj = GCToo.GCToo(src=file_path,
                            version=version,
                            row_metadata_df=row_metadata_df,
                            col_metadata_df=col_metadata_df,
                            data_df=data_df, make_multiindex=make_multiindex)
    return gctoo_obj
Code example #24
File: test_dry.py Project: karenchris/psp
    def test_p100_calculate_dists_and_apply_offsets_if_needed(self):
        no_optim = True
        offset_bounds = (-2, 2)
        prov_code = ["PR1", "L2X", "filtering"]
        e_dists = [14.75, 0.0, 4.75]
        e_offsets = [3.25, 0.0, -2.25]
        e_prov_code = ["PR1", "L2X", "filtering", "LLB"]

        data = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
                            index=["a", "b", "c", "d"],
                            columns=["e", "f", "g"], dtype=float)
        row_meta = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
                                index=["a", "b", "c", "d"],
                                columns=["row_field1", "row_field2"])
        col_meta = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
                                index=["e", "f", "g"],
                                columns=["col_field1", "col_field2"])
        in_gct = GCToo.GCToo(data_df=data, row_metadata_df=row_meta, col_metadata_df=col_meta)

        # P100 & optim
        (out_gct, out_dists, out_offsets, out_prov_code) = (
            dry.p100_calculate_dists_and_apply_offsets_if_needed(
                in_gct, "p100", no_optim_bool=False,
                offset_bounds=offset_bounds, prov_code=prov_code,
                prov_code_entry="LLB"))

        self.assertTrue(np.allclose(out_offsets, e_offsets, atol=1e-2), (
            "out_offsets:\n{}\ne_offsets:\n{}".format(out_offsets, e_offsets)))
        self.assertTrue(np.allclose(out_dists, e_dists, atol=1e-2), (
            "out_dists:\n{}\ne_dists:\n{}".format(out_dists, e_dists)))
        self.assertEqual(out_prov_code, e_prov_code)

        # P100 but no optim
        e_dists2 = [57, 0, 25]
        (out_gct, out_dists, out_offsets, out_prov_code) = (
            dry.p100_calculate_dists_and_apply_offsets_if_needed(
                in_gct, "p100", no_optim_bool=True,
                offset_bounds=offset_bounds, prov_code=prov_code,
                prov_code_entry="LLB"))

        self.assertTrue(np.allclose(out_dists, e_dists2, atol=1e-2), (
            "out_dists:\n{}\ne_dists2:\n{}".format(out_dists, e_dists2)))
        self.assertEqual(out_offsets, None)
        self.assertEqual(out_prov_code, prov_code)

        # GCP
        (out_gct, out_dists, out_offsets, out_prov_code) = (
            dry.p100_calculate_dists_and_apply_offsets_if_needed(
                in_gct, "gcp", no_optim_bool=True,
                offset_bounds=offset_bounds, prov_code=prov_code,
                prov_code_entry="LLB"))

        self.assertEqual(out_dists, None)
        self.assertEqual(out_offsets, None)
        self.assertEqual(out_prov_code, prov_code)
Code example #25
def shear(gctoo, bad_wells):
    remove = gctoo.col_metadata_df[gctoo.col_metadata_df['pert_well'].isin(
        bad_wells)].index
    new_data_df = gctoo.data_df.drop(remove, axis=1)
    new_col_df = gctoo.col_metadata_df.drop(remove)
    new_gctoo = GCToo.GCToo(data_df=new_data_df,
                            col_metadata_df=new_col_df,
                            row_metadata_df=gctoo.row_metadata_df)

    return new_gctoo
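
shear is a thin wrapper over DataFrame.drop keyed on the 'pert_well' column metadata field; a one-line usage sketch with hypothetical well ids:

cleaned = shear(my_gctoo, bad_wells=['A01', 'P24'])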
Code example #26
def get_plate_qc_data_map_and_run(data_map, metadata_map, norm_cell_metadata, project_name, out_dir, invar):

    for plate in metadata_map['inst']['prism_replicate'].unique():
        print(plate)
        plate_data_map = {}
        # Columns belonging to this plate, in instrument metadata order
        plate_index = metadata_map['inst'][metadata_map['inst']['prism_replicate'] == plate].index
        for key in data_map:
            if key == 'count' or key == 'mfi':
                plate_data_map[key] = GCToo.GCToo(
                    data_df=data_map[key].data_df[plate_index],
                    col_metadata_df=metadata_map['inst'].loc[plate_index],
                    row_metadata_df=metadata_map['cell'])
            else:
                # Later data levels may have dropped columns, so keep only those still present
                cols = [x for x in plate_index if x in data_map[key].data_df.columns]
                plate_data_map[key] = GCToo.GCToo(
                    data_df=data_map[key].data_df.loc[:, cols],
                    col_metadata_df=metadata_map['inst'].loc[cols],
                    row_metadata_df=norm_cell_metadata)

        print(plate_data_map['norm'].data_df.shape)
        plate_summary.plate_qc(out_dir, plate, plate_data_map, invar=invar)
Code example #27
def main(args):

    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)

    assert len(gct_paths) > 1, "full_path_wildcard: {}".format(
        full_path_wildcard)

    # Extract prefixes in order to use them later for saving
    prefixes = [(os.path.basename(path)).split(args.prefix_separator)[0]
                for path in gct_paths]

    for path, prefix in zip(gct_paths, prefixes):
        print "path: {}".format(path)
        print "prefix: {}".format(prefix)

    # Import gcts
    gctoos = [parse(x) for x in gct_paths]

    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for g, prefix in zip(gctoos, prefixes):

        # Extract data_df
        score_df = g.data_df

        # Must be square
        assert score_df.shape[0] == score_df.shape[
            1], "Input dataframe must be square."

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        if args.do_percentile_rank:
            rank_df = score_df.rank(ascending=False, pct=True) * 100
        else:
            rank_df = score_df.rank(ascending=False)

        # Make a GCToo
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=g.row_metadata_df,
                                 col_metadata_df=g.col_metadata_df)

        # Save the rank_df to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo,
                 out_name,
                 filler_null="NaN",
                 data_null="NaN",
                 metadata_null="NaN")
Code example #28
File: test_dry.py Project: yuanjun/psp
    def test_p100_filter_samples_by_dist(self):
        offsets = np.array([4, 3, 7], dtype=float)
        dists = np.array([1, 6, 2], dtype=float)
        dist_sd_cutoff = 1
        prov_code = ["A", "B"]
        data = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                            index=["a", "b", "c", "d"],
                            columns=["e", "f", "g"])
        row_meta = pd.DataFrame(
            [["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
            index=["a", "b", "c", "d"],
            columns=["row_field1", "row_field2"])
        col_meta = pd.DataFrame(
            [["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
            index=["e", "f", "g"],
            columns=["col_field1", "col_field2"])
        in_gct = GCToo.GCToo(data_df=data,
                             row_metadata_df=row_meta,
                             col_metadata_df=col_meta)

        # P100
        e_data = pd.DataFrame([[1, 3], [4, 6], [7, 9], [10, 12]],
                              index=["a", "b", "c", "d"],
                              columns=["e", "g"])
        e_col_meta = pd.DataFrame([["cm1", "cm2"], ["cm5", "cm6"]],
                                  index=["e", "g"],
                                  columns=["col_field1", "col_field2"])
        e_offsets = np.array([4, 7], dtype=float)
        e_remaining = ["e", "g"]
        e_prov_code = ["A", "B", "OSF1"]

        (out_gct, out_offsets, out_remaining,
         out_prov_code) = (dry.p100_filter_samples_by_dist(
             in_gct, "p100", offsets, dists, dist_sd_cutoff, prov_code, "OSF"))

        self.assertTrue(np.allclose(out_gct.data_df, e_data, atol=1e-2))
        self.assertTrue(np.array_equal(out_gct.col_metadata_df, e_col_meta))
        self.assertTrue(np.array_equal(out_gct.row_metadata_df, row_meta))
        self.assertTrue(np.allclose(out_offsets, e_offsets, atol=1e-2))
        self.assertEqual(out_remaining, e_remaining)
        self.assertEqual(out_prov_code, e_prov_code)

        # GCP
        (out_gct2, out_offsets2, out_remaining2,
         out_prov_code2) = (dry.p100_filter_samples_by_dist(
             in_gct, "gcp", None, dists, dist_sd_cutoff, prov_code, "OSF"))

        self.assertTrue(np.allclose(out_gct2.data_df, data, atol=1e-2))
        self.assertTrue(np.array_equal(out_gct2.col_metadata_df, col_meta))
        self.assertTrue(np.array_equal(out_gct2.row_metadata_df, row_meta))
        self.assertEqual(out_offsets2, None)
        self.assertEqual(out_remaining2, None)
        self.assertEqual(out_prov_code2, prov_code)
Code example #29
def main(args):
    """ The main method. """

    # Read test gct
    test_gct = parse(args.test_gct_path, convert_neg_666=False, make_multiindex=True)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path, convert_neg_666=False, make_multiindex=True)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = prepare_multi_index_dfs(
        test_gct.multi_index_df, bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME,
        TARGET_FIELD_NAME,
        args.separator)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.multi_index_df, bg_gct.multi_index_df)

    # Compute connectivity
    (conn_mi_df, signed_conn_mi_df) = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        args.connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to write output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_col_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_row_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)

    # Create gct and write it to file
    conn_gct = GCToo.GCToo(data_df=signed_data_df, row_metadata_df=signed_row_metadata_df, col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN", metadata_null="NaN")
Code example #30
File: external_query.py Project: karenchris/psp
def do_steep_and_sip(external_gct, internal_gct, bg_gct, similarity_metric,
                     connectivity_metric,
                     fields_to_aggregate_for_external_profiles,
                     fields_to_aggregate_for_internal_profiles):

    #----------STEEP----------#

    # Compute similarity between external and internal profiles
    sim_df = steep.compute_similarity_bw_two_dfs(internal_gct.data_df,
                                                 external_gct.data_df,
                                                 similarity_metric)

    # Row metadata is from gct1, column metadata is from gct2
    row_metadata_for_sim_df = internal_gct.col_metadata_df
    col_metadata_for_sim_df = external_gct.col_metadata_df

    # Append column to both metadata_dfs indicating which similarity_metric was used
    row_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric
    col_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df, row_metadata_for_sim_df,
                          col_metadata_for_sim_df)

    #----------SIP----------#

    # Check symmetry
    (is_test_df_sym,
     is_bg_df_sym) = sip.check_symmetry(sim_gct.data_df, bg_gct.data_df)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_gct, bg_gct) = sip.create_aggregated_fields_in_GCTs(
        sim_gct, bg_gct, fields_to_aggregate_for_external_profiles,
        fields_to_aggregate_for_internal_profiles,
        fields_to_aggregate_for_internal_profiles, QUERY_FIELD_NAME,
        TARGET_FIELD_NAME, SEPARATOR)

    # Compute connectivity
    (_, signed_conn_gct) = sip.compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, connectivity_metric, is_test_df_sym, SEPARATOR)

    # Append to queries a new column saying what connectivity metric was used
    sip.add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    return sim_gct, signed_conn_gct